Source code for PAMI.multipleMinimumSupportBasedFrequentPattern.basic.CFPGrowthPlus

# This code implements the CFPGrowthPlus algorithm for mining frequent patterns with multiple minimum support thresholds from a transactional dataset.
#
# **Importing this algorithm into a python program**
#
#             from PAMI.multipleMinimumSupportBasedFrequentPattern.basic import CFPGrowthPlus as alg
#
#             iFile = "sample.txt"
#
#             MIS = "MIS.txt"
#
#             obj = alg.CFPGrowthPlus(iFile, MIS, sep)
#
#             obj.mine()
#
#             frequentPatterns = obj.getPatterns()
#
#             print("Total number of Frequent Patterns:", len(frequentPatterns))
#
#             obj.savePatterns(oFile)
#
#             Df = obj.getPatternInDataFrame()
#
#             memUSS = obj.getMemoryUSS()
#
#             print("Total Memory in USS:", memUSS)
#
#             memRSS = obj.getMemoryRSS()
#
#             print("Total Memory in RSS", memRSS)
#
#             run = obj.getRuntime()
#
#             print("Total ExecutionTime in seconds:", run)





__copyright__ = """
Copyright (C)  2021 Rage Uday Kiran

     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

from PAMI.multipleMinimumSupportBasedFrequentPattern.basic import abstract as _fp
from deprecated import deprecated

_fp._sys.setrecursionlimit(20000)
# MIS = {}

class _Node:
    """
    A class used to represent the node of frequentPatternTree

    :Attributes:

        itemId: int
            storing item of a node
        counter: int
            To maintain the support of node
        parent: node
            To maintain the parent of node
        children: list
            To maintain the children of node

    :Methods:

        addChild(node)
            Updates the nodes children list and parent for the given node

    """

    def __init__(self, item, children):
        self.itemId = item
        self.counter = 1
        self.parent = None
        self.children = children

    def addChild(self, node):
        """
        Retrieving the child from the tree

        :param node: Children node.
        :type node: Node
        :return: Updates the children nodes and parent nodes
        """
        self.children[node.itemId] = node
        node.parent = self


class _Tree:
    """
    A class used to represent the frequentPatternGrowth tree structure

    :Attributes:

        root : Node
            The first node of the tree set to Null.
        summaries : dictionary
            Stores the nodes itemId which shares same itemId
        info : dictionary
            frequency of items in the transactions

    :Methods:

        addTransaction(transaction, freq)
            adding items of  transactions into the tree as nodes and freq is the count of nodes
        getFinalConditionalPatterns(node)
            getting the conditional patterns from fp-tree for a node
        getConditionalPatterns(patterns, frequencies)
            sort the patterns by removing the items with lower minSup
        generatePatterns(prefix)
            generating the patterns from fp-tree
    """

    def __init__(self):
        self.root = _Node(None, {})
        self.summaries = {}
        self.info = {}

    def addTransaction(self, transaction, count):
        """
        adding transaction into tree

        :param transaction: it represents the one transactions in database
        :type transaction: list
        :param count: frequency of item
        :type count: int
        """

        # This method takes transaction as input and returns the tree
        currentNode = self.root
        for i in range(len(transaction)):
            if transaction[i] not in currentNode.children:
                newNode = _Node(transaction[i], {})
                newNode.freq = count
                currentNode.addChild(newNode)
                if transaction[i] in self.summaries:
                    self.summaries[transaction[i]].append(newNode)
                else:
                    self.summaries[transaction[i]] = [newNode]
                currentNode = newNode
            else:
                currentNode = currentNode.children[transaction[i]]
                currentNode.freq += count

    def getFinalConditionalPatterns(self, alpha, support):
        """
        generates the conditional patterns for a node

        :param alpha: node to generate conditional patterns
        :param support: support value
        :return: returns conditional patterns, frequency of each item in conditional patterns

        """
        finalPatterns = []
        finalFreq = []
        for i in self.summaries[alpha]:
            set1 = i.freq
            set2 = []
            while i.parent.itemId is not None:
                set2.append(i.parent.itemId)
                i = i.parent
            if len(set2) > 0:
                set2.reverse()
                finalPatterns.append(set2)
                finalFreq.append(set1)
        finalPatterns, finalFreq, info = self.getConditionalTransactions(finalPatterns, finalFreq, support)
        return finalPatterns, finalFreq, info

    @staticmethod
    def getConditionalTransactions(ConditionalPatterns, conditionalFreq, support):
        """
        To calculate the frequency of items in conditional patterns and sorting the patterns

        :param ConditionalPatterns: paths of a node
        :param conditionalFreq: frequency of each item in the path
        :param support: support value
        :return: conditional patterns and frequency of each item in transactions
        """
        #global _minSup
        pat = []
        freq = []
        data1 = {}
        for i in range(len(ConditionalPatterns)):
            for j in ConditionalPatterns[i]:
                if j in data1:
                    data1[j] += conditionalFreq[i]
                else:
                    data1[j] = conditionalFreq[i]
        up_dict = {k: v for k, v in data1.items() if v >= support}
        #up_dict = data1.copy()
        count = 0
        for p in ConditionalPatterns:
            p1 = [v for v in p if v in up_dict]
            trans = sorted(p1, key=lambda x: (up_dict.get(x), -x), reverse=True)
            if len(trans) > 0:
                pat.append(trans)
                freq.append(conditionalFreq[count])
            count += 1
        return pat, freq, up_dict

    def generatePatterns(self, prefix):
        """
        To generate the frequent patterns

        :param prefix: an empty list
        :return: Frequent patterns that are extracted from fp-tree

        """
        global minMIS, MIS
        for i in sorted(self.summaries, key=lambda x: (self.info.get(x), -x)):
            pattern = prefix[:]
            pattern.append(i)
            sup = []
            for j in pattern:
                sup.append(MIS[j])
            #if self.info[i] >= minMIS:
            if self.info[i] >= min(sup):
              yield pattern, self.info[i]
            #patterns, freq, info = self.getFinalConditionalPatterns(i, min(sup))
            patterns, freq, info = self.getFinalConditionalPatterns(i, minMIS)
            conditionalTree = _Tree()
            conditionalTree.info = info.copy()
            for pat in range(len(patterns)):
                conditionalTree.addTransaction(patterns[pat], freq[pat])
            if len(patterns) > 0:
                for q in conditionalTree.generatePatterns(pattern):
                    yield q

minMIS = 0
MIS = {}

[docs] class CFPGrowthPlus(_fp._frequentPatterns): """ **About this algorithm** :**Description**: This code implements the CFPGrowthPlus algorithm for mining frequent patterns with multiple minimum support thresholds from a transactional dataset. :**Reference**: R. Uday Kiran P. Krishna Reddy Novel techniques to reduce search space in multiple minimum supports-based frequent pattern mining algorithms. 11-20 2011 EDBT https://doi.org/10.1145/1951365.1951370 :**Parameters**: - **iFile** (*str*) -- Name of the Input file to mine complete set of Uncertain Multiple Minimum Support Based Frequent patterns. - **oFile** (*str*) -- Name of the output file to store complete set of Uncertain Minimum Support Based Frequent patterns. - **MIS** (*str*) -- Name of the MIS file to mine complete set of Uncertain Multiple Minimum Support Based Frequent patterns. - **sep** (*str*) -- This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator. :**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.* - **endTime** (*float*) -- *To record the completion time of the mining process.* - **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.* - **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.* - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.* - **Database** (*list*) -- *To store the transactions of a database in list.* - **mapSupport** (*Dictionary*) -- *To maintain the information of item and their frequency.* - **tree** (*class*) -- *it represents the Tree class.* :**Methods**: - **mine()** -- *Mining process will start from here.* - **getPatterns()** -- *Complete set of patterns will be retrieved with this function.* - **savePatterns(oFile)** -- *Complete set of frequent patterns will be loaded in to a output file.* - **getPatternsAsDataFrame()** -- *Complete set of frequent patterns will be loaded in to a dataframe.* - **getMemoryUSS()** -- *Total amount of USS memory consumed by the mining process will be retrieved from this function.* - **getMemoryRSS()** -- *Total amount of RSS memory consumed by the mining process will be retrieved from this function.* - **getRuntime()** -- *Total amount of runtime taken by the mining process will be retrieved from this function.* - **creatingItemSets()** -- *Scans the dataset or dataframes and stores in list format.* - **frequentOneItem()** -- *Extracts the one-frequent patterns from transactions.* **Execution methods** **Terminal command** .. code-block:: console Format: (.venv) $ python3 CFPGrowthPlus.py <inputFile> <outputFile> <MISFile> Example Usage: (.venv) $ python3 CFPGrowthPlus.py sampleDB.txt patterns.txt MISFile.txt .. note:: minSup will be considered in support count or frequency **Calling from a python program** .. code-block:: python from PAMI.multipleMinimumSupportBasedFrequentPattern.basic import CFPGrowthPlus as alg iFile = "sample.txt" MIS = "MIS.txt" obj = alg.CFPGrowthPlus(iFile, MIS, sep) obj.mine() frequentPatterns = obj.getPatterns() print("Total number of Frequent Patterns:", len(frequentPatterns)) obj.savePatterns(oFile) Df = obj.getPatternInDataFrame() memUSS = obj.getMemoryUSS() print("Total Memory in USS:", memUSS) memRSS = obj.getMemoryRSS() print("Total Memory in RSS", memRSS) run = obj.getRuntime() print("Total ExecutionTime in seconds:", run) **Credits** The complete program was written by P.Likhitha under the supervision of Professor Rage Uday Kiran.\n """ __startTime = float() __endTime = float() _MIS = str __finalPatterns = {} _iFile = " " _oFile = " " _sep = " " __memoryUSS = float() __memoryRSS = float() __Database = [] __mapSupport = {} __lno = 0 __tree = _Tree() __rank = {} __rankDup = {} def __init__(self, iFile, MIS, sep='\t'): super().__init__(iFile, MIS, sep) def __creatingItemSets(self): """ Storing the complete transactions of the database/input file in a database variable """ self.__Database = [] if isinstance(self._iFile, _fp._pd.DataFrame): if self._iFile.empty: print("its empty..") i = self._iFile.columns.values.tolist() if 'Transactions' in i: self.__Database = self._iFile['Transactions'].tolist() # print(self.Database) if isinstance(self._iFile, str): if _fp._validators.url(self._iFile): data = _fp._urlopen(self._iFile) for line in data: line = line.strip() line = line.decode("utf-8") temp = [i.rstrip() for i in line.split(self._sep)] temp = [x for x in temp if x] self.__Database.append(temp) else: try: with open(self._iFile, 'r', encoding='utf-8') as f: for line in f: line = line.strip() temp = [i.rstrip() for i in line.split(self._sep)] temp = [x for x in temp if x] self.__Database.append(temp) except IOError: print("File Not Found") quit() def _getMISValues(self): """ Storing the Minimum supports given by the user for each item in the database """ self._MISValues = {} if isinstance(self._MIS, _fp._pd.DataFrame): items, MIS = [], [] if self._MIS.empty: print("its empty..") i = self._MIS.columns.values.tolist() if 'items' in i: items = self._MIS['items'].tolist() if 'MIS' in i: MIS = self._MIS['MIS'].tolist() for i in range(len(items)): self._MISValues[items[i]] = MIS[i] if isinstance(self._MIS, str): if _fp._validators.url(self._MIS): data = _fp._urlopen(self._MIS) for line in data: line = line.strip() line = line.decode("utf-8") temp = [i.rstrip() for i in line.split(self._sep)] temp = [x for x in temp if x] self._MISValues[temp[0]] = int(temp[1]) else: try: with open(self._MIS, 'r', encoding='utf-8') as f: for line in f: line = line.strip() temp = [i.rstrip() for i in line.split(self._sep)] temp = [x for x in temp if x] self._MISValues[temp[0]] = int(temp[1]) except IOError: print("File Not Found") quit() def __convert(self, value): """ to convert the type of user specified minSup value :param value: user specified minSup value :return: converted type """ if type(value) is int: value = int(value) if type(value) is float: value = (len(self.__Database) * value) if type(value) is str: if '.' in value: value = float(value) value = (len(self.__Database) * value) else: value = int(value) return value def __frequentOneItem(self): """ Generating One frequent items sets """ global minMIS self.__mapSupport = {} for tr in self.__Database: for i in range(0, len(tr)): if tr[i] not in self.__mapSupport: self.__mapSupport[tr[i]] = 1 else: self.__mapSupport[tr[i]] += 1 self.__mapSupport = {k: v for k, v in self.__mapSupport.items() if v >= min(self._MISValues.values())} minMIS = min(self._MISValues.values()) genList = [k for k, v in sorted(self.__mapSupport.items(), key=lambda x: x[1], reverse=True)] self.__rank = dict([(index, item) for (item, index) in enumerate(genList)]) return genList def __updateTransactions(self, itemSet): """ Updates the items in transactions with rank of items according to their support :Example: oneLength = {'a':7, 'b': 5, 'c':'4', 'd':3} rank = {'a':0, 'b':1, 'c':2, 'd':3} :param itemSet: list of one-frequent items """ list1 = [] for tr in self.__Database: list2 = [] for i in range(len(tr)): if tr[i] in itemSet: list2.append(self.__rank[tr[i]]) if len(list2) >= 1: list2.sort() list1.append(list2) return list1 @staticmethod def __buildTree(transactions, info): """ Builds the tree with updated transactions :param transactions: updated transactions :param info: support details of each item in transactions. :return: transactions compressed in fp-tree """ rootNode = _Tree() rootNode.info = info.copy() for i in range(len(transactions)): rootNode.addTransaction(transactions[i], 1) return rootNode def __savePeriodic(self, itemSet): """ The duplication items and their ranks :param itemSet: frequent itemSet that generated :return: patterns with original item names. """ temp = str() for i in itemSet: temp = temp + self.__rankDup[i] + " " return temp
[docs] @deprecated("It is recommended to use mine() instead of mine() for mining process") def startMine(self): """ main program to start the operation """ self.mine()
[docs] def mine(self): """ main program to start the operation """ global MIS self.__startTime = _fp._time.time() if self._iFile is None: raise Exception("Please enter the file path or file name:") self.__creatingItemSets() self._getMISValues() itemSet = self.__frequentOneItem() updatedTransactions = self.__updateTransactions(itemSet) for x, y in self.__rank.items(): MIS[y] = self._MISValues[x] self.__rankDup[y] = x info = {self.__rank[k]: v for k, v in self.__mapSupport.items()} __Tree = self.__buildTree(updatedTransactions, info) patterns = __Tree.generatePatterns([]) self.__finalPatterns = {} for k in patterns: s = self.__savePeriodic(k[0]) self.__finalPatterns[str(s)] = k[1] print("Frequent patterns were generated successfully using Conditional Frequent Pattern Growth algorithm") self.__endTime = _fp._time.time() self.__memoryUSS = float() self.__memoryRSS = float() process = _fp._psutil.Process(_fp._os.getpid()) self.__memoryUSS = process.memory_full_info().uss self.__memoryRSS = process.memory_info().rss
[docs] def getMemoryUSS(self): """Total amount of USS memory consumed by the mining process will be retrieved from this function :return: returning USS memory consumed by the mining process :rtype: float """ return self.__memoryUSS
[docs] def getMemoryRSS(self): """Total amount of RSS memory consumed by the mining process will be retrieved from this function :return: returning RSS memory consumed by the mining process :rtype: float """ return self.__memoryRSS
[docs] def getRuntime(self): """Calculating the total amount of runtime taken by the mining process :return: returning total amount of runtime taken by the mining process :rtype: float """ return self.__endTime - self.__startTime
[docs] def getPatternsAsDataFrame(self): """Storing final frequent patterns in a dataframe :return: returning frequent patterns in a dataframe :rtype: pd.DataFrame """ dataframe = {} data = [] for a, b in self.__finalPatterns.items(): data.append([a, b]) dataframe = _fp._pd.DataFrame(data, columns=['Patterns', 'Support']) return dataframe
[docs] def save(self, outFile): """Complete set of frequent patterns will be loaded in to a output file :param outFile: name of the output file :type outFile: file """ self._oFile = outFile writer = open(self._oFile, 'w+') for x, y in self.__finalPatterns.items(): s1 = x + ":" + str(y) writer.write("%s \n" % s1)
[docs] def getPatterns(self): """ Function to send the set of frequent patterns after completion of the mining process :return: returning frequent patterns :rtype: dict """ return self.__finalPatterns
[docs] def printResults(self) -> None: """ this function is used to print the results :return: None """ print("Total number of Frequent Patterns:", len(self.getPatterns())) print("Total Memory in USS:", self.getMemoryUSS()) print("Total Memory in RSS", self.getMemoryRSS()) print("Total ExecutionTime in ms:", self.getRuntime())
if __name__ == "__main__": _ap = str() if len(_fp._sys.argv) == 4 or len(_fp._sys.argv) == 5: if len(_fp._sys.argv) == 5: _ap = CFPGrowthPlus(_fp._sys.argv[1], _fp._sys.argv[3], _fp._sys.argv[4]) if len(_fp._sys.argv) == 4: _ap = CFPGrowthPlus(_fp._sys.argv[1], _fp._sys.argv[3]) _ap.mine() _Patterns = _ap.getPatterns() print("Total number of Frequent Patterns:", len(_Patterns)) _ap.savePatterns(_fp._sys.argv[2]) _memUSS = _ap.getMemoryUSS() print("Total Memory in USS:", _memUSS) _memRSS = _ap.getMemoryRSS() print("Total Memory in RSS", _memRSS) _run = _ap.getRuntime() print("Total ExecutionTime in ms:", _run) else: print("Error! The number of input parameters do not match the total number of parameters provided")