频繁模式挖掘FP-Growth(Frequent Pattern Growth)

最新推荐文章于 2025-05-22 10:04:50 发布

lengo

最新推荐文章于 2025-05-22 10:04:50 发布

阅读量2.3k

点赞数 1

CC 4.0 BY-SA版权

分类专栏：数据挖掘

本文链接：https://blog.youkuaiyun.com/lengo/article/details/78700265

数据挖掘专栏收录该内容

27 篇文章

订阅专栏

本文详细介绍了FPGrowth算法的实现过程，包括主程序及关键函数的代码与解释，通过两次扫描数据集的方式构建FP树并挖掘频繁模式。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

本算法参照《数据挖掘概念与技术》第三版（韩家炜）来实现的，算法的伪代码如下（截图）：

FPGrowth频繁模式挖掘主程序：

clc;
clear;
%最小支持度设定
min_sup=2;
%最小置信度
min_conf=0.9;
%读取事物数据，按行读
fid=fopen('D:\matlabFile\FPGrowth\dataApriori.txt','r');
%记录读取的行号，与实际的事务数相对应，同时为了分配存储空间
NumEvent=1;
Dataset=cell(1,1);
while ~feof(fid)
    lineinfo=fgetl(fid);
    c=str2num(lineinfo);
     if (isempty(lineinfo))
       continue; %空行，重新读下一行
     end         
     Dataset{NumEvent,1}=c;
     NumEvent=NumEvent+1;
end
 fclose(fid);
NumEvent=NumEvent-1;        
%存放当前频繁项，item
L=zeros(1,2);
%第一次扫描Dataset,统计每一个候选项，并存在二维数组中
%对所有item进行统计，并按支持度递减排序  
NumEvent=size(Dataset,1);
for i=1:NumEvent        
    for j=1:length(Dataset{i})
        %寻找当前项在L中是否已经存在
        result=find(L(:,1)==Dataset{i}(1,j));
        if isempty(result)
            %如不存在，则新建一行，支持度为1
            L=cat(1,L,[Dataset{i}(1,j),1]);
        else
            %如存在，则支持度加1
            L(result,2)=L(result,2)+1;
        end
    end
end
%将L的第一行赋空
L(1,:)=[];
%按照支持度递减排序，
L=sortrows(L,-2);
%存储FP树
tree=cell(1,2);
%第二次扫描事务数据库Dataset，构造FP树，
for i=1:NumEvent
    L_temp=[];
    %把每一个事务中的item按照其支持度进行排序
    for j=1:length(Dataset{i})
        result=find(L(:,1)==Dataset{i}(1,j));
        if isempty(result)==0
            L_temp=cat(1,L_temp,L(result,:));
        end
    end
    L_temp=sortrows(L_temp,-2);
    L_temp=(L_temp(:,1))';
    tree=treeFunc(L_temp,tree,1,i);                
end
%进行模式挖掘
%对L按支持度的降序重新排序
L=sortrows(L,2);
%对L的每一项进行频繁项的挖掘
%最终的频繁项
freqItems=cell(1,1);
%频繁项计数
n=1;
%临时频繁项存储
freqItems_temp=cell(1,1);
for i=1:size(L,1)-1
    frequentItem=cell(1,1);
    item=[];
    %找出所有包含L(i,1)项
    frequentItem=ruleMining(L(i,1),tree,item,frequentItem);
    frequentItem(1,:)=[];
    if isempty(frequentItem)
        continue;
    end
    %构造条件树
    conditionTree=cell(1,2);
    conditionItem=zeros(1,2);
    for j=1:size(frequentItem,1)
        if size(frequentItem{j,1},2)~=0
            flag_2=0;
            p=0;
            %conditionTree的行代表在条件树起始节点null下，有几个分支
            if size(conditionTree{1,1},2)~=0
                for x=1:size(conditionTree,1)
                    if conditionTree{x,1}==frequentItem{j,1}(1,1)
                        conditionTree{x,2}(1,2)=conditionTree{x,2}(1,2)+frequentItem{j,1}(1,2);
                        flag_2=1;
                        p=x;
                        break;
                    end
                end                
            end

            if flag_2==0
                p=size(conditionTree,1)+1;
                conditionTree{p,1}=frequentItem{j,1}(1,1);
                conditionTree{p,2}=[frequentItem{j,1}(1,1),frequentItem{j,1}(1,size(frequentItem{j,1},2))];
                if size(conditionTree{1,1},2)==0
                    conditionTree(1,:)=[];
                    p=p-1;
                end
            end
            if size(conditionTree{p,2},2)==0
                conditionTree{p,2}=conditionItem;
                conditionItem=conditionTree{p,2};
            else
                conditionItem=conditionTree{p,2};
            end

            %验证conditionItem是否已经存在，如存在则直接将频繁项的数量可以直接相加
            for  m=2:size(frequentItem{j,1},2)-1
                flag=0;
                for n=1:size(conditionItem,1)
                    if conditionItem(n,1)==frequentItem{j,1}(1,m)
                        conditionItem(n,2)=conditionItem(n,2)+frequentItem{j,1}(1,size(frequentItem{j,1},2));
                        flag=1;
                        break;
                    end
                end
                if flag==0
                    conditionItem=cat(1,conditionItem,[frequentItem{j,1}(1,m),frequentItem{j,1}(1,size(frequentItem{j,1},2))]);
                end
            end
            conditionTree{p,2}=conditionItem;
        end
    end
    %对生成的条件树进行最小支持度过滤，
    for k=1:size(conditionTree,1)
        %对支持度小于min_sup的进行过滤
        conditionItem=conditionTree{k,2};   
        if isempty(conditionItem)
            continue;
        end
        result=find(conditionItem(:,2)<min_sup);
        row=unique(result);
        conditionItem(row,:)=[];        
        %对剩余item进行组合,需要把当前的
        result=find(conditionItem(:,1)==L(i,1));        
        row=unique(result);
        currentItem=conditionItem(row,:);
        conditionItem(row,:)=[];
        %取出所有item项
        c=conditionItem(:,1)';
        c_temp=cell(1,1);
        %对Item进行组合，并生成频繁项
        for m=1:length(c)
            %组合
            C=nchoosek(1:1:length(c),m);
            %取最小值
            for p=1:size(C,1)
                cItem=currentItem;
                for q=1:size(C,2)
                    cItem=cat(1,cItem,conditionItem(C(p,q),:));
                end
                %求支持度最小的项
                [x,y]=min(cItem(:,2));
                %得到频繁项
                freqItems_temp{n,1}=cItem(:,1)';
                fItem=cat(2,cItem(:,1)',cItem(y,2));
                freqItems{n,1}=fItem;
                n=n+1;
            end
         end
    end
end
%------------------对重复的频繁项进行合并----------------------------------
%临时频繁项集，与上面的频繁项集基本一致，只是少了最后一列的支持度计数，目的
%是为了比较两个频繁项是否一致，以便进行合并
while ~isempty(freqItems_temp)
    for i=1:size(freqItems_temp,1)-1
        flag=0;
        for j=i+1:size(freqItems_temp,1)
            if isequal(freqItems_temp{i,1},freqItems_temp{j,1})
                %将临时频繁项的重复项进行合并
                freqItems_temp(j,:)=[];
                %将重复的频繁项最后的一列的支持度计数进行合并
                freqItems{i,1}(1,size(freqItems{i,1},2))=freqItems{i,1}(1,size(freqItems{i,1},2))+freqItems{j,1}(1,size(freqItems{j,1},2));
                %将频繁项中的重复项清除
                freqItems(j,:)=[];
                flag=1;
                break;
            end
        end
        if flag==1
            break;
        end
    end
    if i==size(freqItems_temp,1)-1
        break;
    end
end
%显示频繁项集
freqItems

ruleMining函数的实现过程：

function result=ruleMining(L,currentNode,item,frequentItem)%L用来保存当前项，currentNode用来保存当前节点，用来保存项集
%开始遍历节点
%这里的Item2是为了保存最上层的item，以便在下面的for循环中保持不变
item2=item;
for i=1:size(currentNode,2)
    item=item2;
   if size(currentNode{1,i},2)~=0
       %如果当前结点已经存在且item为空，则直接跳过该次循环
       if isempty(item)&¤tNode{1,i}{1,1}==L
          continue;
       end
       %如果当前结点的类型等于当前类型，则记录该频繁项
       if currentNode{1,i}{1,1}==L
           %获取当前结点的结点类型，即item
           a=currentNode{1,i}{1,1};
           %获取当前结点的支持度
           b=currentNode{1,i}{1,2};
           %如找到当前项，则把结果保存到项集中，项集的最后一位保存了该项的数量
           item=cat(2,item,[a b]);
           %将当前频繁项加入到频繁项集中
           frequentItem{size(frequentItem,1)+1,1}=item;
           %去掉当前item的最后两列，以便继续进行遍历
           item=item(1:length(item)-2);
       else
%            if size(currentNode,2)>2
               %判断当前节点的第三个cell是否为空，如果不为空，则进行下一级的循环遍历
               if size(currentNode{1,i}{1,3},2)>0
                   %将当前结点的结点类型加入到item中
                   item=cat(2,item,currentNode{1,i}{1,1});
                   %进行下一级的遍历
                   frequentItem=ruleMining(L,currentNode{1,i}{1,3},item,frequentItem);                   
               end
%            else
%                
%            end
       end
   end
end
%返回结果
result=frequentItem;

treeFunc函数的实现过程：

function result=treeFunc(L_temp,currentNode,f,f1)%参数f确保返回的是一个整体的树，f1保证是从第二次开始循环
      
    %初次建树，直接增加一个节点
    if isempty(currentNode{1,1})
        Node=cell(1,2);
        newNode=cell(1,3);
        newNode{1}=L_temp(1,1);
        newNode{2}=1;
        Node{1,1}=newNode;
        currentNode{1,1}=Node;
        Node=currentNode{1,1};       
    else  
        %对已有的树进行遍历
        if f1>1
            flag_1=0;
            for i=1:size(currentNode,2)
                if size(currentNode{1,i})~=0
                    if currentNode{1,i}{1,1}==L_temp(1,1)
                        currentNode{1,i}{1,2}=currentNode{1,i}{1,2}+1;
                        Node=currentNode;
                        flag_1=1;
                        break;
                    end
                end
            end
            %在根节点下增加新的节点
            if flag_1==0
                newNode=cell(1,3);
                newNode{1}=L_temp(1,1);
                newNode{2}=1;
                currentNode{1,size(currentNode,2)+1}=newNode;
                Node=currentNode;
            end
        else
           flag_1=0;
           if flag_1==0 
            %如果cell的第三列为空，则创建的cell（1×2）加入到第三列
            if size(currentNode{1,3},2)==0
                Node=cell(1,2);
                newNode=cell(1,3);
                newNode{1}=L_temp(1,1);
                newNode{2}=1;
                Node{1,1}=newNode;
                currentNode{1,3}=Node;  
                Node=currentNode{1,3};
            else
                %如果第三列不为空，则对其进行遍历，寻找匹配项                
                flag=0;
                for m=1:size(currentNode{1,3},2)  
                    if size(currentNode{1,3}{1,m})~=0
                        if currentNode{1,3}{1,m}{1,1}==L_temp(1,1)
                            flag=1;
                            currentNode{1,3}{1,m}{1,2}=currentNode{1,3}{1,m}{1,2}+1;    
                            Node=currentNode{1,3};
                            break;
                        end
                    end
                end
                %如果没找到匹配项，则增加一个新节点
                if flag==0
                    newNode=cell(1,3);
                    newNode{1}=L_temp(1,1);
                    newNode{2}=1;
                    currentNode{1,3}{1,size(currentNode{1,3},2)+1}=newNode;
                    Node=currentNode{1,3};
                end
            end                                 
           end                           
        end
    end
    const=L_temp(1,1);
    L_temp(:,1)=[];
    if isempty(L_temp)==0
       if size(Node,2)>2
           for i=1:size(Node,2)
               if size(Node{1,i})~=0
                   if Node{1,i}{1,1}==const
                       Node{1,i}{1,3}=treeFunc(L_temp,Node{1,i},0,0); 
                   end
               end
           end
       else
           Node{1,1}{1,3}=treeFunc(L_temp,Node{1,1},0,0);   
       end
       result=Node;
    else
        result=Node;
    end
end

实验数据依然用的是与Apriori算法相同的数据，如下：

dataApriori.txt实验数据文件，请复制后保存为txt文件