This algorithm was written to do word-association analysis on a collection of specialist articles; it is not the best showcase for Apriori, but rather a practical exercise in word-frequency analysis. In the code below, the support of a word set X is the fraction of articles whose word set contains every word of X, and the confidence of a rule X => Y is sup(X ∪ Y) / sup(X); itemsets are kept once their support reaches minsup (0.3), and rules are printed when their confidence exceeds minconf (0.99).
package com.my.analysis;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import redis.clients.jedis.Jedis;
public class AprioriMyImpl {
private double minsup = 0.3;// minimum support
private double minconf = 0.99;// minimum confidence
private int limitword = 100;// number of top words taken into the analysis (used by the commented-out Redis-based init below)
private ArrayList<Set<String>> aricleWL;// word set of each parsed article
private ArrayList<Set<Set<String>>> candidateList;// candidate itemsets, one entry per pass
private ArrayList<Set<Set<String>>> frequencyList;// frequent itemsets, one entry per pass
public Set<Set<String>> allSub = new HashSet<Set<String>>();// all subsets of the current maximal frequent itemset
private long filecount;// total number of parsed files
private int step = 1;// current pass, starting at 1
private Jedis jedis = new Jedis("localhost", 6379);
public AprioriMyImpl() {
candidateList = new ArrayList<Set<Set<String>>>();
frequencyList = new ArrayList<Set<Set<String>>>();
aricleWL = new ArrayList<Set<String>>();
filecount = jedis.llen(AnsjTxtFileParserForRedis.FILELIST);
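// load the word set of every parsed article from Redis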
for(int i = 0;i < filecount;i++){
aricleWL.add(jedis.smembers(AnsjTxtFileParserForRedis.FILEPREFIX+i));
}
}
/**
* Initialize the first candidate itemsets: one singleton set per top word.
*/
// public void item1_init(){
// Set<Set<String>> candidate1 = new HashSet<Set<String>>();
// Set<String> tset = jedis.zrevrange(AnsjTxtFileParserForRedis.TABLENAME, 0,limitword-1);
// for(String s:tset){
// HashSet<String> one = new HashSet<String>();
// one.add(s);
// candidate1.add(one);
// }
// candidateList.add(candidate1);
// System.out.println("Candidate itemsets-"+(step)+":");
// printSetSetString(candidate1);
// }
public void item1_init(){
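// hard-coded list of the top words; the commented-out variant above pulls them from a Redis sorted set instead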
String[] keys ={"睡眠","时间","宝宝","治疗","疾病","身体","呼吸","质量","孩子","入睡","人体","精神","习惯","心理","障碍","枕头","保健","关注","医生","女性","症状","食物","饮食","运动","中医","床垫","儿童","婴儿","阅读","大脑","按摩","效果","癫痫","环境","营养","压力","血液","智能","休息","妈妈","男人","生理","医学","社会","药物","肌肉","男性","科技","恢复","减肥","放松","神经","危害","情绪","怀孕","午睡","分泌","下降","反馈","音乐","刺激","糖尿病","姿势","老人","熬夜","消化","记忆","消除","起床","客户","食品","感冒","高血压","招聘","老年人","孕妇","手表","解决","现象","超过","颈椎","全身","空调","侧卧","位置","体温","金笔","达到","打鼾","电视","能量","催眠","物质","状况","精力","作者","设备","价格","病人","保护","数据","经验","正文","适合","妇科","锻炼","新生儿","咳嗽","抑郁症","血管","抑制","幼儿","失眠症","心脏病","食疗","血压","肿瘤","诱发","重视","心血管","寿命","小便","免疫力","月经","评测","记忆力","智力"};
Set<Set<String>> candidate1 = new HashSet<Set<String>>();
for(String s:keys){
HashSet<String> one = new HashSet<String>();
one.add(s);
candidate1.add(one);
}
candidateList.add(candidate1);
System.out.println("Candidate itemsets-"+(step)+":");
printSetSetString(candidate1);
}
/**
* Filter the current candidate itemsets down to the frequent itemsets.
*/
public boolean candidateToFrequency(){
Set<Set<String>> candItems = candidateList.get(step-1);
Set<Set<String>> freqItems = new HashSet<Set<String>>();
for(Set<String> item:candItems){
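// an itemset is frequent when it appears in at least minsup of all articles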
if((count_sup(item)/filecount)>=minsup){
freqItems.add(item);
}
}
if(freqItems.size()==0){// no candidate reached the minimum support
return false;
}
frequencyList.add(freqItems);
System.out.println("Frequent itemsets-"+(step)+":");
printSetSetString(freqItems);// print the frequent itemsets
step++;
return true;
}
/**
* Build the next candidate itemsets from the current frequent itemsets.
*/
public boolean frequencyToCandidate(){
Set<Set<String>> frequencyItems = frequencyList.get(step-2);
Set<String> maxSub = maxSubSet(frequencyItems);
Set<Set<String>> candidateItems = new HashSet<Set<String>>();
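// extend each frequent itemset by one extra word and keep the result only if all of its subsets are frequent (Apriori pruning)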
for(Set<String> freqs : frequencyItems){
int len = freqs.size();
for(String sub:maxSub){
Set<String> pItem = new HashSet<String>();
pItem.addAll(freqs);
pItem.add(sub);
if(pItem.size()==(len+1)&&subIsFreq(frequencyItems,pItem)){
candidateItems.add(pItem);
}
}
}
if(candidateItems.size()==0){// no new candidate itemsets could be formed
return false;
}
candidateList.add(candidateItems);
System.out.println("Candidate itemsets-"+(step)+":");
printSetSetString(candidateItems);// print the candidate itemsets
return true;
}
/**
* Check that every (size-1) subset of parentSet is contained in the frequent collection freq.
* This is the Apriori pruning condition: an itemset can only be frequent if all of its subsets are.
* @param freq
* @param parentSet
* @return true if every subset is frequent; false otherwise
*/
public boolean subIsFreq(Set<Set<String>> freq,Set<String> parentSet){
for(String s:parentSet){
Set<String> item = new HashSet<String>();
item.addAll(parentSet);
item.remove(s);
if(!freq.contains(item)){
return false;
}
}
return true;
}
/**
* Collect every distinct word that appears in the given frequent itemsets
* (their union, used later to extend candidates by one word).
* @param freqItems the frequent itemsets of the current pass
*/
public Set<String> maxSubSet(Set<Set<String>> freqItems){
Set<String> maxSub = new HashSet<String>();
for(Set<String> ss:freqItems){
for(String s:ss){
maxSub.add(s);
}
}
return maxSub;
}
/**
* Count how many articles contain every word in x (the support count; callers divide by filecount to get the support).
* @param x
* @return the support count of x
*/
public double count_sup(Set<String> x){
int temp = 0;
for(Set<String> ss:aricleWL){
if(ss.containsAll(x)){
temp++;
}
}
return temp;
}
/**
* Compute the confidence of the rule x => y, i.e. sup(x ∪ y) / sup(x).
* @param x the antecedent word set
* @param y the consequent word set
* @return the confidence of x => y
*/
public double count_conf(Set<String> x,Set<String> y){
Set<String> z = new HashSet<String>();
z.addAll(x);
z.addAll(y);
return count_sup(z)/count_sup(x);
}
/**
* Recursively collect every non-empty subset of parent into allSub.
* @param parent
*/
public void genSub(Set<String> parent){
if(parent.size()>0){
allSub.add(parent);
}
Set<String> ss = new HashSet<String>();
ss.addAll(parent);
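// recurse on every subset obtained by removing a single word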
for(String s:ss){
Set<String> ss2 = new HashSet<String>();
ss2.addAll(ss);
ss2.remove(s);
genSub(ss2);
}
}
/**
* Print a collection of itemsets, one itemset per line.
* @param sss
*/
public void printSetSetString(Set<Set<String>> sss){
for(Set<String> ss:sss){
System.out.println(ss);
}
}
/**
* Association-rule analysis: for every ordered pair of disjoint subsets x and y,
* print the rule x => y when its confidence exceeds minconf.
* @param subSet all subsets of one maximal frequent itemset
*/
public void releRuleCount(Set<Set<String>> subSet){
for(Set<String> x:subSet){
for(Set<String> y:subSet){
Set<String> xy = new HashSet<String>();
xy.addAll(x);
xy.addAll(y);
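// x and y are disjoint exactly when |x ∪ y| == |x| + |y|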
if(xy.size()==(x.size()+y.size())){
double conf = count_conf(x,y);
if(conf>minconf){
System.out.println(x+"==>>"+y+"=="+conf);
}
}
}
}
}
public void jisuan(){
item1_init();// build the first candidate itemsets
while(true){
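// alternate candidate generation and support filtering until one step produces nothing new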
if(!candidateToFrequency())
break;
if(!frequencyToCandidate())
break;
}
if(frequencyList.isEmpty()){// no frequent itemsets at all, so there are no rules to derive
return;
}
Set<Set<String>> maxfreqs = frequencyList.get(frequencyList.size()-1);
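// derive rules from the subsets of each maximal frequent itemset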
for(Set<String> maxfreq:maxfreqs){
allSub = new HashSet<Set<String>>();
genSub(maxfreq);
releRuleCount(allSub);
}
}
public static void main(String[] args) {
// initialize the candidates from the top words, then run the analysis
new AprioriMyImpl().jisuan();
}
}
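For reference, AprioriMyImpl expects its input to already be in Redis: AnsjTxtFileParserForRedis (not shown here) is assumed to push one entry per parsed article onto the list named by FILELIST and to store the distinct words of article i in the set named FILEPREFIX + i. The fragment below is only a sketch of that layout; the key values are placeholders I made up, and the Ansj word-segmentation step itself is left out.

import java.util.Set;
import redis.clients.jedis.Jedis;

// Sketch only: the real key names live in AnsjTxtFileParserForRedis, which is not part of this post.
public class RedisLayoutSketch {
    static final String FILELIST = "ansj:filelist";  // assumed value of AnsjTxtFileParserForRedis.FILELIST
    static final String FILEPREFIX = "ansj:file:";   // assumed value of AnsjTxtFileParserForRedis.FILEPREFIX

    // Store one parsed article: its name on the file list, its distinct words in a per-article set.
    public static void store(Jedis jedis, int index, String fileName, Set<String> words) {
        jedis.rpush(FILELIST, fileName);                               // llen(FILELIST) -> number of articles
        jedis.sadd(FILEPREFIX + index, words.toArray(new String[0]));  // smembers(FILEPREFIX + i) -> words of article i
    }
}

With the data in that shape, jedis.llen(FILELIST) yields filecount and jedis.smembers(FILEPREFIX + i) yields the word set loaded into aricleWL in the constructor above.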