# GPFgrowth is algorithm to mine the partial periodic frequent pattern in temporal database.
#
#
# **Importing this algorithm into a python program**
# --------------------------------------------------------
#
# from PAMI.partialPeriodicFrequentPattern.basic import GPFgrowth as alg
#
# obj = alg.GPFgrowth(inputFile, outputFile, minSup, maxPer, minPR)
#
# obj.startMine()
#
# partialPeriodicFrequentPatterns = obj.getPatterns()
#
# print("Total number of partial periodic Patterns:", len(partialPeriodicFrequentPatterns))
#
# obj.save(oFile)
#
# Df = obj.getPatternInDf()
#
# memUSS = obj.getMemoryUSS()
#
# print("Total Memory in USS:", memUSS)
#
# memRSS = obj.getMemoryRSS()
#
# print("Total Memory in RSS", memRSS)
#
# run = obj.getRuntime()
#
# print("Total ExecutionTime in seconds:", run)
#
__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Copyright (C) 2021 Rage Uday Kiran
"""
import sys
from PAMI.partialPeriodicFrequentPattern.basic.abstract import *
orderOfItem = {}
[docs]
class Node:
"""
A class used to represent the node of frequentPatternTree
:Attributes:
item : int
storing item of a node
parent : node
To maintain the parent of every node
child : list
To maintain the children of node
nodeLink : node
To maintain the next node of node
tidList : set
To maintain timestamps of node
:Methods:
getChild(itemName)
storing the children to their respective parent nodes
"""
def __init__(self):
self.item = -1
self.parent = None
self.child = []
self.nodeLink = None
self.tidList = set()
[docs]
def getChild(self, item):
"""
:param item:
:return: if node have node of item, then return it. if node don't have return []
"""
for child in self.child:
if child.item == item:
return child
return []
[docs]
class Tree:
"""
A class used to represent the frequentPatternGrowth tree structure
:Attributes:
root : node
Represents the root node of the tree
nodeLinks : dictionary
storing last node of each item
firstNodeLink : dictionary
storing first node of each item
:Methods:
addTransaction(transaction,timeStamp)
creating transaction as a branch in frequentPatternTree
fixNodeLinks(itemName, newNode)
add newNode link after last node of item
deleteNode(itemName)
delete all node of item
createPrefixTree(path,timeStampList)
create prefix tree by path
createConditionalTree(PFList, minSup, maxPer, minPR, last)
create conditional tree. Its nodes are satisfy IP / (minSup+1) >= minPR
"""
def __init__(self):
self.root = Node()
self.nodeLinks = {}
self.firstNodeLink = {}
[docs]
def addTransaction(self, transaction, tid):
"""
add transaction into tree
:param transaction: it represents the one transactions in database
:type transaction: list
:param tid: represents the timestamp of transaction
:type tid: list
"""
current = self.root
for item in transaction:
child = current.getChild(item)
if not child:
newNode = Node()
newNode.item = item
newNode.parent = current
current.child.append(newNode)
current = newNode
self.fixNodeLinks(item, newNode)
else:
current = child
current.tidList.add(tid)
[docs]
def fixNodeLinks(self, item, newNode):
"""
fix node link
:param item: it represents item name of newNode
:type item: string
:param newNode: it represents node which is added
:type newNode: Node
"""
if item in self.nodeLinks:
lastNode = self.nodeLinks[item]
lastNode.nodeLink = newNode
self.nodeLinks[item] = newNode
if item not in self.firstNodeLink:
self.firstNodeLink[item] = newNode
[docs]
def deleteNode(self, item):
"""
delete the node from tree
:param item: it represents the item name of node
:type item: str
"""
deleteNode = self.firstNodeLink[item]
parentNode = deleteNode.parent
parentNode.child.remove(deleteNode)
parentNode.child += deleteNode.child
parentNode.tidList |= deleteNode.tidList
for child in deleteNode.child:
child.parent = parentNode
while deleteNode.nodeLink:
deleteNode = deleteNode.nodeLink
parentNode = deleteNode.parent
parentNode.child.remove(deleteNode)
parentNode.child += deleteNode.child
parentNode.tidList |= deleteNode.tidList
for child in deleteNode.child:
child.parent = parentNode
[docs]
def createPrefixTree(self, path, tidList):
"""
create prefix tree by path
:param path: it represents path to root from prefix node
:type path: list
:param tidList: it represents tid of each item
:type tidList: list
"""
currentNode = self.root
for item in path:
child = currentNode.getChild(item)
if not child:
newNode = Node()
newNode.item = item
newNode.parent = currentNode
currentNode.child.append(newNode)
currentNode = newNode
self.fixNodeLinks(item, newNode)
else:
currentNode = child
currentNode.tidList |= tidList
[docs]
def createConditionalTree(self, PFList, minSup, maxPer, minPR, last):
"""
create conditional tree by PFlist
:param PFList: it represents timestamp each item
:type PFList: dict
:param minSup: it represents minSup
:param maxPer: it represents maxPer
:param minPR: it represents minPR
:param last: it represents last timestamp in database
:return: return is PFlist which satisfy ip / (minSup+1) >= minPR
"""
keys = list(PFList)
for item in keys:
ip = calculateIP(maxPer, PFList[item], last).run()
if ip / (minSup+1) >= minPR:
continue
else:
self.deleteNode(item)
del PFList[item]
return PFList
[docs]
class calculateIP:
"""
This class calculate ip from timestamp
:Attributes:
maxPer : float
it represents user defined maxPer value
timeStamp : list
it represents timestamp of item
timeStampFinal : int
it represents last timestamp of database
:Methods:
run
calculate ip from its timestamp list
"""
def __init__(self, maxPer, timeStamp, timeStampFinal):
self.maxPer = maxPer
self.timeStamp = timeStamp
self.timeStampFinal = timeStampFinal
[docs]
def run(self):
"""
calculate ip from timeStamp list
:return: it represents ip value
"""
ip = 0
self.timeStamp = sorted(list(self.timeStamp))
if len(self.timeStamp) <= 0:
return ip
if self.timeStamp[0] - 0 <= self.maxPer:
ip += 1
for i in range(len(self.timeStamp)-1):
if (self.timeStamp[i+1] - self.timeStamp[i]) <= self.maxPer:
ip += 1
if abs(self.timeStamp[(len(self.timeStamp)-1)] - self.timeStampFinal) <= self.maxPer:
ip += 1
return ip
[docs]
class generatePFListver2:
"""
generate time stamp list from input file
:Attributes:
inputFile : str
it is input file name
minSup : float
user defined minimum support value
maxPer : float
user defined max Periodicity value
minPR : float
user defined min PR value
PFList : dict
storing timestamps each item
"""
def __init__(self, Database, minSup, maxPer, minPR):
self.Database = Database
self.minSup = minSup
self.maxPer = maxPer
self.minPR = minPR
self.PFList = {}
self.tsList = {}
[docs]
def run(self):
"""
generate PFlist
:return: timestamps and last timestamp
"""
global orderOfItem
tidList = {}
tid = 1
for transaction in self.Database:
timestamp = int(transaction[0])
for item in transaction[1:]:
if item not in self.PFList:
self.tsList[item] = {}
self.tsList[item][tid] = timestamp
ip = 0
if timestamp <= self.maxPer:
ip = 1
self.PFList[item] = [1, ip, timestamp]
else:
self.tsList[item][tid] = timestamp
if timestamp - self.PFList[item][2] <= self.maxPer:
self.PFList[item][1] += 1
self.PFList[item][0] += 1
self.PFList[item][2] = timestamp
tid += 1
last = timestamp
for item in self.PFList:
if last - self.PFList[item][2] <= self.maxPer:
self.PFList[item][1] += 1
if self.PFList[item][1] / (self.minSup + 1) < self.minPR or self.PFList[item][0] < self.minSup:
del self.tsList[item]
#self.PFList = {tuple([k]): v for k, v in sorted(self.PFList.items(), key=lambda x:x[1], reverse=True)}
tidList = {tuple([k]): v for k, v in sorted(self.tsList.items(), key=lambda x:len(x[1]), reverse=True)}
orderOfItem = tidList.copy()
return tidList, last
[docs]
def findSeparator(self, line):
"""
find separator of line in database
:param line: it represents one line in database
:type line: list
:return: return separator
"""
separator = ['\t', ',', '*', '&', ' ', '%', '$', '#', '@', '!', ' ', '*', '(', ')']
for i in separator:
if i in line:
return i
return None
[docs]
class generatePFTreever2:
"""
create tree from tidList and input file
:Attributes:
inputFile : str
it represents input file name
tidList : dict
storing tids each item
root : Node
it represents the root node of the tree
:Methods:
run
it create tree
find separator(line)
find separator in the line of database
"""
def __init__(self, Database, tidList):
self.Database = Database
self.tidList = tidList
self.root = Tree()
[docs]
def run(self):
"""
create tree from database and tidList
:return: the root node of tree
"""
tid = 1
for transaction in self.Database:
timestamp = int(transaction[0])
transaction = [tuple([item]) for item in transaction[1:] if tuple([item]) in self.tidList]
transaction = sorted(transaction, key=lambda x: len(self.tidList[x]), reverse=True)
self.root.addTransaction(transaction, timestamp)
tid += 1
return self.root
[docs]
def findSeparator(self, line):
"""
find separator of line in database
:param line: it represents one line in database
:type line: list
:return: return separator
"""
separator = ['\t', ',', '*', '&', ' ', '%', '$', '#', '@', '!', ' ', '*', '(', ')']
for i in separator:
if i in line:
return i
return None
[docs]
class PFgrowth:
"""
This class is pattern growth algorithm
:Attributes:
tree : Node
represents the root node of prefix tree
prefix : list
prefix is list of prefix items
PFList : dict
storing time stamp each item
minSup : float
user defined min Support
maxPer : float
user defined max Periodicity
minPR : float
user defined min PR
last : int
represents last time stamp in database
:Methods:
run
it is pattern growth algorithm
"""
def __init__(self, tree, prefix, PFList, minSup, maxPer, minPR, last):
self.tree = tree
self.prefix = prefix
self.PFList = PFList
self.minSup = minSup
self.maxPer = maxPer
self.minPR = minPR
self.last = last
[docs]
def run(self):
"""
run the pattern growth algorithm
:return: partial periodic frequent pattern in conditional pattern
"""
global orderOfItem
result = {}
items = self.PFList
if not self.prefix:
items = reversed(self.PFList)
for item in items:
prefix = self.prefix.copy()
prefix.append(item)
PFList = {}
prefixTree = Tree()
prefixNode = self.tree.firstNodeLink[item]
tidList = prefixNode.tidList
path = []
currentNode = prefixNode.parent
while currentNode.item != -1:
path.insert(0, currentNode.item)
currentNodeItem = currentNode.item
if currentNodeItem in PFList:
PFList[currentNodeItem] |= tidList
else:
PFList[currentNodeItem] = tidList
currentNode = currentNode.parent
prefixTree.createPrefixTree(path, tidList)
while prefixNode.nodeLink:
prefixNode = prefixNode.nodeLink
tidList = prefixNode.tidList
path = []
currentNode = prefixNode.parent
while currentNode.item != -1:
path.insert(0, currentNode.item)
currentNodeItem = currentNode.item
if currentNodeItem in PFList:
PFList[currentNodeItem] = PFList[currentNodeItem] | tidList
else:
PFList[currentNodeItem] = tidList
currentNode = currentNode.parent
prefixTree.createPrefixTree(path, tidList)
ip = calculateIP(self.maxPer, self.PFList[item], self.last).run()
s = len(self.PFList[item])
if ip / (s+1) >= self.minPR and s >= self.minSup:
result[tuple(prefix)] = [s, ip / (s+1)]
if PFList:
PFList = {k: v for k, v in sorted(PFList.items(), key=lambda x:len(orderOfItem[x[0]]), reverse=True)}
PFList = prefixTree.createConditionalTree(PFList, self.minSup, self.maxPer, self.minPR, self.last)
if PFList:
#self.PFList = {tuple([k]): v for k, v in sorted(self.PFList.items(), key=lambda x: x[1], reverse=True)}
obj = PFgrowth(prefixTree, prefix, PFList, self.minSup, self.maxPer, self.minPR, self.last)
result1 = obj.run()
result = {**result, **result1}
return result
[docs]
class GPFgrowth(partialPeriodicPatterns):
"""
:Description: GPFgrowth is algorithm to mine the partial periodic frequent pattern in temporal database.
:Reference: R. Uday Kiran, J.N. Venkatesh, Masashi Toyoda, Masaru Kitsuregawa, P. Krishna Reddy, Discovering partial periodic-frequent patterns in a transactional database,
Journal of Systems and Software, Volume 125, 2017, Pages 170-182, ISSN 0164-1212, https://doi.org/10.1016/j.jss.2016.11.035.
:param iFile: str :
Name of the Input file to mine complete set of frequent pattern's
:param oFile: str :
Name of the output file to store complete set of frequent patterns
:param minSup: str:
The user can specify minSup either in count or proportion of database size.
:param minPR: str:
Controls the maximum number of transactions in which any two items within a pattern can reappear.
:param maxPer: str:
Controls the maximum number of transactions in which any two items within a pattern can reappear.
:param sep: str :
This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.
:Attributes:
inputFile : file
Name of the input file to mine complete set of frequent pattern
minSup : float
The user defined minSup
maxPer : float
The user defined maxPer
minPR : float
The user defined minPR
finalPatterns : dict
it represents to store the pattern
runTime : float
storing the total runtime of the mining process
memoryUSS : float
storing the total amount of USS memory consumed by the program
memoryRSS : float
storing the total amount of RSS memory consumed by the program
:Methods:
startMine()
Mining process will start from here
getPatterns()
Complete set of patterns will be retrieved with this function
storePatternsInFile(ouputFile)
Complete set of frequent patterns will be loaded in to an output file
getPatternsAsDataFrame()
Complete set of frequent patterns will be loaded in to an output file
getMemoryUSS()
Total amount of USS memory consumed by the mining process will be retrieved from this function
getMemoryRSS()
Total amount of RSS memory consumed by the mining process will be retrieved from this function
getRuntime()
Total amount of runtime taken by the mining process will be retrieved from this function
**Executing code on Terminal:**
----------------------------------
Format:
>>> python3 GPFgrowth.py <inputFile> <outputFile> <minSup> <maxPer> <minPR>
Examples:
>>> python3 GPFgrowth.py sampleDB.txt patterns.txt 10 10 0.5
**Sample run of the importing code:**
--------------------------------------
... code-block:: python
from PAMI.partialPeriodicFrequentPattern.basic import GPFgrowth as alg
obj = alg.GPFgrowth(inputFile, outputFile, minSup, maxPer, minPR)
obj.startMine()
partialPeriodicFrequentPatterns = obj.getPatterns()
print("Total number of partial periodic Patterns:", len(partialPeriodicFrequentPatterns))
obj.save(oFile)
Df = obj.getPatternInDf()
memUSS = obj.getMemoryUSS()
print("Total Memory in USS:", memUSS)
memRSS = obj.getMemoryRSS()
print("Total Memory in RSS", memRSS)
run = obj.getRuntime()
print("Total ExecutionTime in seconds:", run)
**Credits:**
--------------
The complete program was written by Nakamura under the supervision of Professor Rage Uday Kiran.
"""
_partialPeriodicPatterns__iFile = ' '
_partialPeriodicPatterns__oFile = ' '
_partialPeriodicPatterns__sep = str()
_partialPeriodicPatterns__startTime = float()
_partialPeriodicPatterns__endTime = float()
_partialPeriodicPatterns__minSup = str()
_partialPeriodicPatterns__maxPer = str()
_partialPeriodicPatterns__minPR = str()
_partialPeriodicPatterns__finalPatterns = {}
runTime = 0
_partialPeriodicPatterns__memoryUSS = float()
_partialPeriodicPatterns__memoryRSS = float()
__Database = []
def __convert(self, value):
"""
to convert the type of user specified minSup value
:param value: user specified minSup value
:return: converted type
"""
if type(value) is int:
value = int(value)
if type(value) is float:
value = (len(self.__Database) * value)
if type(value) is str:
if '.' in value:
value = float(value)
value = (len(self.__Database) * value)
else:
value = int(value)
return value
def __readDatabase(self):
self.__Database = []
if isinstance(self.__inputFile, pd.DataFrame):
if self.__inputFile.empty:
print("its empty..")
i = self.__inputFile.columns.values.tolist()
if 'Transactions' in i:
self.__Database = self.__inputFile['Transactions'].tolist()
if 'Patterns' in i:
self.__Database = self.__inputFile['Patterns'].tolist()
if isinstance(self.__inputFile, str):
if validators.url(self.__inputFile):
data = urlopen(self.__inputFile)
for line in data:
line.strip()
line = line.decode("utf-8")
temp = [i.rstrip() for i in line.split(self._partialPeriodicPatterns__sep)]
temp = [x for x in temp if x]
self.__Database.append(temp)
else:
try:
with open(self.__inputFile, 'r', encoding='utf-8') as f:
for line in f:
line.strip()
temp = [i.rstrip() for i in line.split(self._partialPeriodicPatterns__sep)]
temp = [x for x in temp if x]
self.__Database.append(temp)
except IOError:
print("File Not Found")
quit()
[docs]
def startMine(self):
self.__inputFile = self._partialPeriodicPatterns__iFile
self._partialPeriodicPatterns__startTime = time.time()
self._partialPeriodicPatterns__finalPatterns = {}
self.__readDatabase()
self._partialPeriodicPatterns__minSup = self.__convert(self._partialPeriodicPatterns__minSup)
self._partialPeriodicPatterns__maxPer = self.__convert(self._partialPeriodicPatterns__maxPer)
# self.minPR = self.convert(self.minPR)
self._partialPeriodicPatterns__finalPatterns = {}
obj = generatePFListver2(self.__Database, self._partialPeriodicPatterns__minSup, self._partialPeriodicPatterns__maxPer, self._partialPeriodicPatterns__minPR)
tidList, last = obj.run()
PFTree = generatePFTreever2(self.__Database, tidList).run()
obj2 = PFgrowth(PFTree, [], tidList, self._partialPeriodicPatterns__minSup, self._partialPeriodicPatterns__maxPer, self._partialPeriodicPatterns__minPR, last)
self._partialPeriodicPatterns__finalPatterns = obj2.run()
self._partialPeriodicPatterns__endTime = time.time()
self.__runTime = self._partialPeriodicPatterns__endTime - self._partialPeriodicPatterns__startTime
process = psutil.Process(os.getpid())
self._partialPeriodicPatterns__memoryUSS = float()
self._partialPeriodicPatterns__memoryRSS = float()
self._partialPeriodicPatterns__memoryUSS = process.memory_full_info().uss
self._partialPeriodicPatterns__memoryRSS = process.memory_info().rss
[docs]
def getMemoryUSS(self):
"""Total amount of USS memory consumed by the mining process will be retrieved from this function
:return: returning USS memory consumed by the mining process
:rtype: float
"""
return self._partialPeriodicPatterns__memoryUSS
[docs]
def getRuntime(self):
"""Calculating the total amount of runtime taken by the mining process
:return: returning total amount of runtime taken by the mining process
:rtype: float
"""
return self.__runTime
[docs]
def getPatternsAsDataFrame(self):
"""Storing final frequent patterns in a dataframe
:return: returning frequent patterns in a dataframe
:rtype: pd.DataFrame
"""
dataframe = {}
data = []
for a, b in self._partialPeriodicPatterns__finalPatterns.items():
if len(a) == 1:
pattern = f'{a[0][0]}'
else:
pattern = f'{a[0][0]}'
for item in a[1:]:
pattern = pattern + f' {item[0]}'
#print(pattern)
data.append([pattern, b[0], b[1]])
dataframe = pd.DataFrame(data, columns=['Patterns', 'Support', 'PeriodicRatio'])
return dataframe
[docs]
def save(self, outFile):
"""Complete set of frequent patterns will be loaded in to an output file
:param outFile: name of the output file
:type outFile: csv file
"""
self.oFile = outFile
writer = open(self.oFile, 'w+')
for x, y in self._partialPeriodicPatterns__finalPatterns.items():
if len(x) == 1:
writer.write(f'{x[0][0]}:{y[0]}:{y[1]}\n')
else:
writer.write(f'{x[0][0]}')
for item in x[1:]:
writer.write(f'\t{item[0]}')
writer.write(f':{y[0]}:{y[1]}\n')\
[docs]
def getPatterns(self):
""" Function to send the set of frequent patterns after completion of the mining process
:return: returning frequent patterns
:rtype: dict
"""
return self._partialPeriodicPatterns__finalPatterns
[docs]
def printResults(self):
""" this function is used to print the results
"""
print("Total number of Partial Periodic Frequent Patterns:", len(self.getPatterns()))
print("Total Memory in USS:", self.getMemoryUSS())
print("Total Memory in RSS", self.getMemoryRSS())
print("Total ExecutionTime in ms:", self.getRuntime())
if __name__ == '__main__':
ap = str()
if len(sys.argv) == 6 or len(sys.argv) == 7:
if len(sys.argv) == 7:
ap = GPFgrowth(sys.argv[1], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6])
if len(sys.argv) == 6:
ap = GPFgrowth(sys.argv[1], sys.argv[3], sys.argv[4], sys.argv[5])
ap.startMine()
print("Total number of Frequent Patterns:", len(ap.getPatterns()))
ap.save(sys.argv[2])
print("Total Memory in USS:", ap.getMemoryUSS())
print("Total Memory in RSS", ap.getMemoryRSS())
print("Total ExecutionTime in ms:", ap.getRuntime())
else:
for i in [1000, 2000, 3000, 4000, 5000]:
_ap = GPFgrowth('/Users/Likhitha/Downloads/temporal_T10I4D100K.csv', i, 500, 0.7, '\t')
_ap.startMine()
print("Total number of Maximal Partial Periodic Patterns:", len(_ap.getPatterns()))
_ap.save('/Users/Likhitha/Downloads/output.txt')
print("Total Memory in USS:", _ap.getMemoryUSS())
print("Total Memory in RSS", _ap.getMemoryRSS())
print("Total ExecutionTime in ms:", _ap.getRuntime())
print("Error! The number of input parameters do not match the total number of parameters provided")