# Top-K is an algorithm to discover the top frequent patterns in a transactional database.
#
# **Importing this algorithm into a python program**
#
# import PAMI.frequentPattern.topk.FAE as alg
#
# iFile = 'sampleDB.txt'
#
# K = 2
#
# obj = alg.FAE(iFile, K)
#
# obj.mine()
#
# topKFrequentPatterns = obj.getPatterns()
#
# print("Total number of Frequent Patterns:", len(topKFrequentPatterns))
#
# obj.save(oFile)
#
# Df = obj.getPatternsAsDataFrame()
#
# memUSS = obj.getMemoryUSS()
#
# print("Total Memory in USS:", memUSS)
#
# memRSS = obj.getMemoryRSS()
#
# print("Total Memory in RSS", memRSS)
#
# run = obj.getRuntime()
#
# print("Total ExecutionTime in seconds:", run)
#
__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from PAMI.frequentPattern.topk import abstract as _ab
from deprecated import deprecated
[docs]
class FAE(_ab._frequentPatterns):
    """
    **About this algorithm**

    :**Description**: Top-K is an algorithm to discover the top frequent patterns in a transactional database.

    :**Reference**: Zhi-Hong Deng, Guo-Dong Fang: Mining Top-Rank-K Frequent Patterns: DOI: 10.1109/ICMLC.2007.4370261 · Source: IEEE Xplore https://ieeexplore.ieee.org/document/4370261

    :**Parameters**: - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.*
                     - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.*
                     - **k** (*int*) -- *User specified count of top frequent patterns.*
                     - **minimum** (*int*) -- *Minimum number of frequent patterns to consider in analysis.*
                     - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default separator is tab space. However, the users can override their default separator.*

    :**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.*
                     - **endTime** (*float*) -- *To record the completion time of the mining process.*
                     - **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.*
                     - **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.*
                     - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.*

    **Execution methods**

    **Terminal command**

    .. code-block:: console

      Format:

      (.venv) $ python3 FAE.py <inputFile> <outputFile> <K>

      Example Usage:

      (.venv) $ python3 FAE.py sampleDB.txt patterns.txt 10

    .. note:: k will be considered as count of top frequent patterns to consider in analysis.
              A value containing a decimal point is instead multiplied by the number of transactions.

    **Calling from a python program**

    .. code-block:: python

            import PAMI.frequentPattern.topk.FAE as alg

            iFile = 'sampleDB.txt'

            K = 2

            obj = alg.FAE(iFile, K)

            obj.mine()

            topKFrequentPatterns = obj.getPatterns()

            print("Total number of Frequent Patterns:", len(topKFrequentPatterns))

            obj.save(oFile)

            Df = obj.getPatternsAsDataFrame()

            memUSS = obj.getMemoryUSS()

            print("Total Memory in USS:", memUSS)

            memRSS = obj.getMemoryRSS()

            print("Total Memory in RSS", memRSS)

            run = obj.getRuntime()

            print("Total ExecutionTime in seconds:", run)

    **Credits:**

    The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran.
    """

    # Time stamp taken when mine() begins.
    _startTime = float()
    # Time stamp taken when mine() has finished generating patterns.
    _endTime = float()
    # Requested number of top patterns; normalised by _convert() inside mine().
    _k = int()
    # Maps a pattern (items joined by a tab) to its support.
    _finalPatterns = {}
    # Input source: a DataFrame, a URL or a file path (see _creatingItemSets()).
    _iFile = " "
    # Output path; assigned by save().
    _oFile = " "
    # Separator used to split a transaction line into items.
    _sep = " "
    # Memory readings captured at the end of mine().
    _memoryUSS = float()
    _memoryRSS = float()
    # List of transactions, each transaction being a list of items.
    _Database = []
    # Maps each item to the frozenset of indices of the transactions containing it.
    _tidList = {}
    # Smallest support currently stored in _finalPatterns; used as pruning threshold.
    _minimum = int()
def _creatingItemSets(self):
    """
    Load every transaction of the input into ``self._Database``.

    ``self._iFile`` may be a pandas DataFrame (its ``Transactions`` column is
    split on ``self._sep``), a URL, or a path to a UTF-8 text file holding one
    transaction per line.  For URL and file inputs each line is stripped,
    split on ``self._sep`` and empty items are discarded.

    If the file cannot be opened, "File Not Found" is printed and the
    interpreter exits through ``quit()``.
    """

    def _parse(text):
        # Strip the line first: the original called ``line.strip()`` without
        # keeping the result, so surrounding whitespace was never removed.
        parts = (part.rstrip() for part in text.strip().split(self._sep))
        return [part for part in parts if part]

    self._Database = []
    if isinstance(self._iFile, _ab._pd.DataFrame):
        if self._iFile.empty:
            print("its empty..")
        columns = self._iFile.columns.values.tolist()
        if 'Transactions' in columns:
            rows = self._iFile['Transactions'].tolist()
            self._Database = [row.split(self._sep) for row in rows]
        else:
            print("The column name should be Transactions and each line should be separated by tab space or a seperator specified by the user")
    if isinstance(self._iFile, str):
        if _ab._validators.url(self._iFile):
            data = _ab._urlopen(self._iFile)
            for line in data:
                # The response yields bytes: decode before any text handling.
                self._Database.append(_parse(line.decode("utf-8")))
        else:
            try:
                with open(self._iFile, 'r', encoding='utf-8') as f:
                    for line in f:
                        self._Database.append(_parse(line))
            except IOError:
                print("File Not Found")
                quit()
def _frequentOneItem(self):
    """
    Select the most frequent single items as the initial top-k patterns.

    Builds ``self._tidList`` (item -> frozenset of transaction indices),
    ranks the items by support and stores the leading ones in
    ``self._finalPatterns`` until it holds at least ``self._k`` entries.
    ``self._minimum`` is set to the smallest stored support, or ``0`` when
    nothing was stored (empty database or non-positive k).

    Support is the number of distinct transactions containing the item, so an
    item repeated inside one transaction is counted once, in line with how
    longer patterns are measured (size of their tid-set).

    :return: the selected items, most frequent first
    :rtype: list
    """
    occurrences = {}
    for tid, transaction in enumerate(self._Database):
        for item in transaction:
            occurrences.setdefault(item, []).append(tid)
    self._tidList = {item: frozenset(tids) for item, tids in occurrences.items()}
    support = {item: len(tids) for item, tids in self._tidList.items()}

    # sorted() is stable, so ties keep first-appearance order.
    ranked = sorted(support, key=support.get, reverse=True)

    self._finalPatterns = {}
    for item in ranked:
        # Explicit comparison (not slicing) because k may be a float.
        if len(self._finalPatterns) >= self._k:
            break
        self._finalPatterns[item] = support[item]

    # default=0 avoids a ValueError when no item was selected.
    self._minimum = min(self._finalPatterns.values(), default=0)
    return list(self._finalPatterns)
def _save(self, prefix, suffix, tidSetI):
    """
    Offer a candidate pattern to the current top-k collection.

    The pattern is ``prefix + suffix`` (or just ``suffix`` when ``prefix`` is
    ``None``), stored under its items joined by tabs; its support is the size
    of ``tidSetI``.

    While fewer than ``self._k`` patterns are stored the candidate is added
    only if its support exceeds ``self._minimum``.  Once the collection is
    full the candidate replaces the lowest-support entry when it beats it.
    After every change ``self._finalPatterns`` is re-ordered by descending
    support and ``self._minimum`` is refreshed.

    :param prefix: items already fixed for the pattern, or None
    :type prefix: list
    :param suffix: items appended to the prefix
    :type suffix: list
    :param tidSetI: transaction indices containing the pattern
    :type tidSetI: collection
    """
    items = suffix if prefix is None else prefix + suffix
    support = len(tidSetI)
    pattern = "\t".join(items)

    def _rerank():
        ordered = sorted(self._finalPatterns.items(), key=lambda entry: entry[1], reverse=True)
        self._finalPatterns = dict(ordered)
        self._minimum = min(self._finalPatterns.values())

    if len(self._finalPatterns) < self._k:
        if support > self._minimum:
            self._finalPatterns[pattern] = support
            _rerank()
        return

    if not self._finalPatterns:
        return

    # Only the weakest entry can be beaten first; on ties the earliest stored
    # entry is the one selected.
    weakest, weakestSupport = min(self._finalPatterns.items(), key=lambda entry: entry[1])
    if support > weakestSupport:
        del self._finalPatterns[weakest]
        self._finalPatterns[pattern] = support
        _rerank()
def _Generation(self, prefix, itemSets, tidSets):
    """
    Depth-first expansion of one equivalence class of candidate patterns.

    Every item in ``itemSets`` is combined with the items that follow it; a
    combination is kept when the intersection of the two tid-sets has at
    least ``self._minimum`` transactions.  The kept combinations are explored
    recursively with the item prepended to ``prefix``, and afterwards the
    item itself is handed to ``_save`` together with ``prefix``.

    :param prefix: items shared by every pattern of this class
    :type prefix: list
    :param itemSets: items that can extend the prefix
    :type itemSets: list
    :param tidSets: tid-set of each entry of itemSets, in the same order
    :type tidSets: list
    """
    if len(itemSets) == 1:
        # A single candidate cannot be combined any further.
        self._save(prefix, [itemSets[0]], tidSets[0])
        return

    for position, (item, tids) in enumerate(zip(itemSets, tidSets)):
        if item is None:
            continue
        extension = [item]
        childItems = []
        childTids = []
        followers = zip(itemSets[position + 1:], tidSets[position + 1:])
        for otherItem, otherTids in followers:
            common = tids.intersection(otherTids)
            if len(common) >= self._minimum:
                childItems.append(otherItem)
                childTids.append(common)
        self._Generation(extension + prefix, childItems, childTids)
        self._save(prefix, extension, tids)
def _convert(self, value):
    """
    Normalise the user supplied threshold (used by ``mine`` for ``k``).

    * ``float``: multiplied by the number of transactions in the database.
    * ``str`` containing a ``.``: parsed as float and scaled the same way.
    * any other ``str``: parsed as an integer.
    * everything else (including ``int``): returned unchanged.

    :param value: user specified value
    :type value: int or float or str
    :return: converted value
    """
    kind = type(value)
    if kind is float:
        return len(self._Database) * value
    if kind is str:
        if '.' in value:
            return len(self._Database) * float(value)
        return int(value)
    return value
[docs]
@deprecated("It is recommended to use 'mine()' instead of 'startMine()' for mining process. Starting from January 2025, 'startMine()' will be completely terminated.")
def startMine(self):
    """
    Deprecated entry point kept for backward compatibility.

    Simply delegates to ``mine()``.  The deprecation notice now names
    ``startMine()`` as the method being retired; it previously told users to
    use 'mine()' instead of 'mine()'.
    """
    self.mine()
[docs]
def mine(self):
    """
    Run the complete top-k mining process.

    Loads the database, normalises ``k``, seeds the result with the most
    frequent single items and then expands every seed with the seeds ranked
    after it.  The start/end time stamps and the USS/RSS memory readings of
    the current process are recorded on the instance.

    :raises Exception: when no input or no k was supplied
    """
    self._startTime = _ab._time.time()
    if self._iFile is None:
        raise Exception("Please enter the file path or file name:")
    if self._k is None:
        raise Exception("Please enter the Minimum Support")

    self._creatingItemSets()
    self._k = self._convert(self._k)
    seeds = self._frequentOneItem()

    for position, seed in enumerate(seeds):
        seedTids = self._tidList[seed]
        extItems = []
        extTids = []
        for other in seeds[position + 1:]:
            shared = seedTids.intersection(self._tidList[other])
            # Keep only pairs that can still reach the current top-k.
            if len(shared) >= self._minimum:
                extItems.append(other)
                extTids.append(shared)
        self._Generation([seed], extItems, extTids)

    print(" TopK frequent patterns were successfully generated using FAE algorithm.")
    self._endTime = _ab._time.time()
    self._memoryUSS = float()
    self._memoryRSS = float()
    process = _ab._psutil.Process(_ab._os.getpid())
    self._memoryUSS = process.memory_full_info().uss
    self._memoryRSS = process.memory_info().rss
[docs]
def getMemoryUSS(self):
    """
    Total amount of USS memory consumed by the mining process will be retrieved from this function

    :return: returning USS memory consumed by the mining process
    :rtype: float
    """
    return self._memoryUSS


def getMemoryRSS(self):
    """
    Total amount of RSS memory consumed by the mining process will be retrieved from this function

    Provided alongside ``getMemoryUSS`` because ``mine()`` records
    ``_memoryRSS`` and ``printResults()`` calls this accessor.

    :return: returning RSS memory consumed by the mining process
    :rtype: float
    """
    return self._memoryRSS
[docs]
def getRuntime(self):
    """
    Report how long the mining process took.

    :return: difference between the recorded end and start time stamps
    :rtype: float
    """
    finished = self._endTime
    started = self._startTime
    return finished - started
[docs]
def getPatternsAsDataFrame(self):
    """
    Return the mined patterns as a two-column dataframe.

    Each row holds a pattern (tabs between items replaced by spaces) in the
    ``Patterns`` column and its support in the ``Support`` column.

    :return: returning frequent patterns in a dataframe
    :rtype: pd.DataFrame
    """
    rows = []
    for pattern, support in self._finalPatterns.items():
        rows.append([pattern.replace('\t', ' '), support])
    return _ab._pd.DataFrame(rows, columns=['Patterns', 'Support'])
[docs]
def save(self, outFile):
    """
    Write every mined pattern to an output file, one ``pattern:support`` per line.

    The file is opened through a context manager so that it is flushed and
    closed even if writing fails; the original left the handle open.

    :param outFile: name of the output file
    :type outFile: str
    """
    self._oFile = outFile
    with open(self._oFile, 'w+') as writer:
        for pattern, support in self._finalPatterns.items():
            line = pattern.strip() + ":" + str(support)
            # Trailing space before the newline is kept for output compatibility.
            writer.write("%s \n" % line)
[docs]
def getPatterns(self):
    """
    Give access to the patterns collected by the mining process.

    :return: dictionary mapping each pattern to its support
    :rtype: dict
    """
    patterns = self._finalPatterns
    return patterns
[docs]
def printResults(self):
    """
    Print a summary of the last run: number of patterns, memory use and runtime.

    The runtime is the difference of two time stamps taken in ``mine()``, so
    it is labelled in seconds (it was previously mislabelled as ms).
    """
    print("Top K Frequent Patterns:", len(self.getPatterns()))
    print("Total Memory in USS:", self.getMemoryUSS())
    print("Total Memory in RSS", self.getMemoryRSS())
    print("Total ExecutionTime in seconds:", self.getRuntime())
if __name__ == "__main__":
    # Usage: FAE.py <inputFile> <outputFile> <K> [<separator>]
    _argv = _ab._sys.argv
    if len(_argv) in (4, 5):
        if len(_argv) == 5:
            _ap = FAE(_argv[1], _argv[3], _argv[4])
        else:
            _ap = FAE(_argv[1], _argv[3])
        # Mine exactly once: the previous double call repeated the whole run.
        _ap.mine()
        print("Top K Frequent Patterns:", len(_ap.getPatterns()))
        _ap.save(_argv[2])
        print("Total Memory in USS:", _ap.getMemoryUSS())
        print("Total Memory in RSS", _ap.getMemoryRSS())
        # getRuntime() is a difference of time stamps, i.e. seconds.
        print("Total ExecutionTime in seconds:", _ap.getRuntime())
    else:
        print("Error! The number of input parameters do not match the total number of parameters provided")