Source code for PAMI.frequentPattern.topk.FAE

# Top - K is an algorithm to discover top frequent patterns in a transactional database.
#
# **Importing this algorithm into a python program**
#
#             import PAMI.frequentPattern.topk.FAE as alg
#
#             iFile = 'sampleDB.txt'
#
#             K = 2
#
#             obj = alg.FAE(iFile, K)
#
#             obj.mine()
#
#             topKFrequentPatterns = obj.getPatterns()
#
#             print("Total number of Frequent Patterns:", len(topKFrequentPatterns))
#
#             obj.save(oFile)
#
#             Df = obj.getPatternsAsDataFrame()
#
#             memUSS = obj.getMemoryUSS()
#
#             print("Total Memory in USS:", memUSS)
#
#             memRSS = obj.getMemoryRSS()
#
#             print("Total Memory in RSS", memRSS)
#
#             run = obj.getRuntime()
#
#             print("Total ExecutionTime in seconds:", run)
#


__copyright__ = """
Copyright (C)  2021 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

from PAMI.frequentPattern.topk import abstract as _ab
from deprecated import deprecated


class FAE(_ab._frequentPatterns):
    """
    **About this algorithm**

    :**Description**: Top - K is an algorithm to discover top frequent patterns in a transactional database.

    :**Reference**: Zhi-Hong Deng, Guo-Dong Fang: Mining Top-Rank-K Frequent Patterns: DOI: 10.1109/ICMLC.2007.4370261 · Source: IEEE Xplore
                    https://ieeexplore.ieee.org/document/4370261

    :**Parameters**:    - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.*
                        - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.*
                        - **k** (*int*) -- *User specified count of top frequent patterns.*
                        - **minimum** (*int*) -- *Minimum number of frequent patterns to consider in analysis.*
                        - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default separator is tab space. However, the users can override their default separator.*

    :**Attributes**:    - **startTime** (*float*) -- *To record the start time of the mining process.*
                        - **endTime** (*float*) -- *To record the completion time of the mining process.*
                        - **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.*
                        - **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.*
                        - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.*

    **Execution methods**

    **Terminal command**

    .. code-block:: console

      Format:

      (.venv) $ python3 FAE.py <inputFile> <outputFile> <K>

      Example Usage:

      (.venv) $ python3 FAE.py sampleDB.txt patterns.txt 10

    .. note:: k will be considered as count of top frequent patterns to consider in analysis.
              A value containing a decimal point is multiplied by the number of transactions.

    **Calling from a python program**

    .. code-block:: python

            import PAMI.frequentPattern.topk.FAE as alg

            iFile = 'sampleDB.txt'

            K = 2

            obj = alg.FAE(iFile, K)

            obj.mine()

            topKFrequentPatterns = obj.getPatterns()

            print("Total number of Frequent Patterns:", len(topKFrequentPatterns))

            obj.save(oFile)

            Df = obj.getPatternsAsDataFrame()

            memUSS = obj.getMemoryUSS()

            print("Total Memory in USS:", memUSS)

            memRSS = obj.getMemoryRSS()

            print("Total Memory in RSS", memRSS)

            run = obj.getRuntime()

            print("Total ExecutionTime in seconds:", run)

    **Credits:**

    The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran.
    """

    # Class-level defaults; every mutable one is rebound per instance before use.
    _startTime = float()
    _endTime = float()
    _k = int()
    _finalPatterns = {}
    _iFile = " "
    _oFile = " "
    _sep = " "
    _memoryUSS = float()
    _memoryRSS = float()
    _Database = []
    _tidList = {}
    _minimum = int()

    def _creatingItemSets(self):
        """
        Storing the complete transactions of the database/input file in a database variable
        """

        def splitLine(line):
            # Trim the line first (the original discarded the result of
            # ``line.strip()``), then split on the separator and drop empty items.
            items = (item.rstrip() for item in line.strip().split(self._sep))
            return [item for item in items if item]

        self._Database = []
        if isinstance(self._iFile, _ab._pd.DataFrame):
            if self._iFile.empty:
                print("its empty..")
            columns = self._iFile.columns.values.tolist()
            if 'Transactions' in columns:
                self._Database = self._iFile['Transactions'].tolist()
                self._Database = [x.split(self._sep) for x in self._Database]
            else:
                print("The column name should be Transactions and each line should be separated by tab space or a seperator specified by the user")
        elif isinstance(self._iFile, str):
            if _ab._validators.url(self._iFile):
                data = _ab._urlopen(self._iFile)
                for line in data:
                    # Lines fetched from a URL are bytes; decode before splitting.
                    self._Database.append(splitLine(line.decode("utf-8")))
            else:
                try:
                    with open(self._iFile, 'r', encoding='utf-8') as f:
                        for line in f:
                            self._Database.append(splitLine(line))
                except IOError:
                    print("File Not Found")
                    quit()

    def _frequentOneItem(self):
        """
        Generating the top-k frequent one-length patterns

        The support of an item is the number of distinct transactions that contain it,
        so an item repeated inside a single transaction is counted only once. This keeps
        the one-item supports consistent with the TID sets used for longer patterns.

        :return: the selected items, ordered by decreasing support
        :rtype: list
        """
        tidSets = {}
        for tid, transaction in enumerate(self._Database):
            for item in transaction:
                tidSets.setdefault(item, set()).add(tid)
        self._tidList = {item: frozenset(tids) for item, tids in tidSets.items()}
        support = {item: len(tids) for item, tids in self._tidList.items()}
        # sorted() is stable, so ties keep their first-seen order.
        ranked = sorted(support, key=support.get, reverse=True)
        self._finalPatterns = {}
        for item in ranked:
            if len(self._finalPatterns) >= self._k:
                break
            self._finalPatterns[item] = support[item]
        # default=0 avoids a ValueError when the database is empty or k <= 0.
        self._minimum = min(self._finalPatterns.values(), default=0)
        return list(self._finalPatterns)

    def _save(self, prefix, suffix, tidSetI):
        """
        Offers a candidate pattern to the current top-k result set.

        While fewer than k patterns are stored, the candidate is added only if its support
        is strictly greater than the current minimum. Once k patterns are stored, the
        candidate replaces the lowest-support pattern if its support is strictly greater.

        :param prefix: the prefix of a pattern
        :type prefix: list or None
        :param suffix: the suffix of a pattern
        :type suffix: list
        :param tidSetI: identifiers of the transactions that contain the pattern
        :type tidSetI: frozenset
        """
        if prefix is None:
            prefix = suffix
        else:
            prefix = prefix + suffix
        val = len(tidSetI)
        sample = "\t".join(prefix)
        if len(self._finalPatterns) < self._k:
            if val <= self._minimum:
                return
        else:
            # Scan in increasing support order and evict the first weaker pattern.
            for x, y in sorted(self._finalPatterns.items(), key=lambda x_: x_[1]):
                if val > y:
                    del self._finalPatterns[x]
                    break
            else:
                # No stored pattern is weaker than the candidate: nothing to do.
                return
        self._finalPatterns[sample] = val
        # Keep the result dictionary ordered by decreasing support.
        self._finalPatterns = dict(sorted(self._finalPatterns.items(), key=lambda item: item[1], reverse=True))
        self._minimum = min(self._finalPatterns.values())

    def _Generation(self, prefix, itemSets, tidSets):
        """
        Recursively extends a prefix with the items of its equivalence class and offers
        every generated pattern to the top-k result set.

        :param prefix: main equivalence prefix
        :type prefix: list
        :param itemSets: items that can extend the prefix
        :type itemSets: list
        :param tidSets: transaction-identifier sets of the prefix combined with each item of itemSets
        :type tidSets: list
        """
        if len(itemSets) == 1:
            self._save(prefix, [itemSets[0]], tidSets[0])
            return
        for i, itemI in enumerate(itemSets):
            if itemI is None:
                continue
            tidSetI = tidSets[i]
            classItemSets = []
            classTidSets = []
            for j in range(i + 1, len(itemSets)):
                y = tidSetI.intersection(tidSets[j])
                # self._minimum may have been raised by earlier _save calls.
                if len(y) >= self._minimum:
                    classItemSets.append(itemSets[j])
                    classTidSets.append(y)
            newPrefix = [itemI] + prefix
            self._Generation(newPrefix, classItemSets, classTidSets)
            self._save(prefix, [itemI], tidSetI)

    def _convert(self, value):
        """
        To convert the type of the user specified k value

        An int is used as is. A float, or a string containing '.', is multiplied by the
        number of transactions in the database. Any other string is parsed as an int.

        :param value: user specified k value
        :type value: int or float or str
        :return: converted value
        """
        if type(value) is int:
            value = int(value)
        elif type(value) is float:
            value = (len(self._Database) * value)
        elif type(value) is str:
            if '.' in value:
                value = (len(self._Database) * float(value))
            else:
                value = int(value)
        return value

    @deprecated("It is recommended to use 'mine()' instead of 'startMine()' for mining process. Starting from January 2025, 'startMine()' will be completely terminated.")
    def startMine(self):
        """
        TopK Frequent pattern mining process will start from here
        """
        self.mine()

    def mine(self):
        """
        TopK Frequent pattern mining process will start from here

        :raises Exception: if the input file or k was not provided
        """
        self._startTime = _ab._time.time()
        if self._iFile is None:
            raise Exception("Please enter the file path or file name:")
        if self._k is None:
            raise Exception("Please enter the Minimum Support")
        self._creatingItemSets()
        self._k = self._convert(self._k)
        plist = self._frequentOneItem()
        for i, itemI in enumerate(plist):
            tidSetI = self._tidList[itemI]
            itemSets = []
            tidSets = []
            for itemJ in plist[i + 1:]:
                y1 = tidSetI.intersection(self._tidList[itemJ])
                if len(y1) >= self._minimum:
                    itemSets.append(itemJ)
                    tidSets.append(y1)
            self._Generation([itemI], itemSets, tidSets)
        print(" TopK frequent patterns were successfully generated using FAE algorithm.")
        self._endTime = _ab._time.time()
        process = _ab._psutil.Process(_ab._os.getpid())
        self._memoryUSS = process.memory_full_info().uss
        self._memoryRSS = process.memory_info().rss

    def getMemoryUSS(self):
        """
        Total amount of USS memory consumed by the mining process will be retrieved from this function

        :return: returning USS memory consumed by the mining process
        :rtype: float
        """
        return self._memoryUSS

    def getMemoryRSS(self):
        """
        Total amount of RSS memory consumed by the mining process will be retrieved from this function

        :return: returning RSS memory consumed by the mining process
        :rtype: float
        """
        return self._memoryRSS

    def getRuntime(self):
        """
        Calculating the total amount of runtime taken by the mining process

        :return: returning total amount of runtime (in seconds) taken by the mining process
        :rtype: float
        """
        return self._endTime - self._startTime

    def getPatternsAsDataFrame(self):
        """
        Storing final frequent patterns in a dataframe

        :return: returning frequent patterns in a dataframe
        :rtype: pd.DataFrame
        """
        # Patterns are stored tab-joined; show them space-separated in the frame.
        rows = [[pattern.replace('\t', ' '), support] for pattern, support in self._finalPatterns.items()]
        return _ab._pd.DataFrame(rows, columns=['Patterns', 'Support'])

    def save(self, outFile):
        """
        Complete set of frequent patterns will be loaded in to an output file

        :param outFile: name of the output file
        :type outFile: csvfile
        """
        self._oFile = outFile
        # The context manager guarantees the file is flushed and closed.
        with open(self._oFile, 'w+') as writer:
            for x, y in self._finalPatterns.items():
                s1 = x.strip() + ":" + str(y)
                writer.write("%s \n" % s1)

    def getPatterns(self):
        """
        Function to send the set of frequent patterns after completion of the mining process

        :return: returning frequent patterns
        :rtype: dict
        """
        return self._finalPatterns

    def printResults(self):
        """
        This function is used to print the results
        """
        print("Top K Frequent Patterns:", len(self.getPatterns()))
        print("Total Memory in USS:", self.getMemoryUSS())
        print("Total Memory in RSS", self.getMemoryRSS())
        # getRuntime() is a difference of time.time() values, i.e. seconds.
        print("Total ExecutionTime in seconds:", self.getRuntime())
if __name__ == "__main__":
    # Usage: FAE.py <inputFile> <outputFile> <K> [<extra>]
    # The optional fifth argument is forwarded as FAE's third positional
    # argument (presumably the item separator -- confirm against the constructor).
    _argc = len(_ab._sys.argv)
    if _argc == 4 or _argc == 5:
        if _argc == 5:
            _ap = FAE(_ab._sys.argv[1], _ab._sys.argv[3], _ab._sys.argv[4])
        else:
            _ap = FAE(_ab._sys.argv[1], _ab._sys.argv[3])
        # Mine exactly once: a second call would repeat the whole run and
        # re-apply the k conversion to an already converted value.
        _ap.mine()
        print("Top K Frequent Patterns:", len(_ap.getPatterns()))
        _ap.save(_ab._sys.argv[2])
        print("Total Memory in USS:", _ap.getMemoryUSS())
        print("Total Memory in RSS", _ap.getMemoryRSS())
        # getRuntime() reports seconds (difference of time.time() values).
        print("Total ExecutionTime in seconds:", _ap.getRuntime())
    else:
        print("Error! The number of input parameters do not match the total number of parameters provided")