package mining;

import java.io.*;
import java.util.*;
/** The class encapsulates an implementation of the Apriori algorithm
 * to compute frequent itemsets.
 *
 * Datasets contain integers (>= 0) separated by spaces, one transaction per line, e.g.
 * 1 2 3
 * 0 9
 * 1 9
 *
 * Usage from the command line:
 * $ java mining.Apriori fileName support
 * $ java mining.Apriori /tmp/data.dat 0.8
 * $ java mining.Apriori /tmp/data.dat 0.8 > frequent-itemsets.txt
 *
 * Usage as a library: see {@link ExampleOfClientCodeOfApriori}
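 *
 * A minimal illustrative client (a sketch, not necessarily the actual ExampleOfClientCodeOfApriori;
 * each frequent itemset is passed to the observer as an int[]):
 * <pre>
 * Observer printer = new Observer() {
 *     public void update(Observable o, Object itemset) {
 *         System.out.println(Arrays.toString((int[]) itemset));
 *     }
 * };
 * new Apriori(new String[] { "/tmp/data.dat", "0.8" }, printer);
 * </pre>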
 *
 * @author Martin Monperrus, University of Darmstadt, 2010
 * @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
 *
 * GNU General Public License v3
 * No reproduction in whole or part without maintaining this copyright notice
 * and imposing this condition on any subsequent users.
 */
public class Apriori extends Observable {

    public static void main(String[] args) throws Exception {
        Apriori ap = new Apriori(args);
    }
    /** the list of current itemsets */
    private List<int[]> itemsets;
    /** the name of the transaction file */
    private String transaFile = "transa.txt";
    /** number of different items in the dataset */
    private int numItems;
    /** total number of transactions in transaFile */
    private int numTransactions;
    /** minimum support for a frequent itemset, as a fraction between 0 and 1 (e.g. 0.8 for 80%) */
    private double minSup;

    private boolean usedAsLibrary = false;
    /** This is the main interface to use this class as a library */
    public Apriori(String[] args, Observer ob) throws Exception
    {
        usedAsLibrary = true;
        configure(args);
        this.addObserver(ob);
        go();
    }
    /** generates the apriori itemsets from a file
     *
     * @param args configuration parameters: args[0] is a filename, args[1] the min support (e.g. 0.8 for 80%)
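     *
     * For example (the file path is just illustrative):
     * <pre>
     * new Apriori(new String[] { "/tmp/data.dat", "0.8" });
     * </pre>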
     */
    public Apriori(String[] args) throws Exception
    {
        configure(args);
        go();
    }
    /** starts the algorithm after configuration */
    private void go() throws Exception {
        // start timer
        long start = System.currentTimeMillis();

        // first we generate the candidates of size 1
        createItemsetsOfSize1();
        int itemsetNumber = 1; // the size of the itemsets currently being looked at
        int nbFrequentSets = 0;

        while (itemsets.size() > 0)
        {
            calculateFrequentItemsets();

            if (itemsets.size() != 0)
            {
                nbFrequentSets += itemsets.size();
                log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup * 100) + "%)");
                createNewItemsetsFromPreviousOnes();
            }

            itemsetNumber++;
        }

        // display the execution time
        long end = System.currentTimeMillis();
        log("Execution time is: " + ((double) (end - start) / 1000) + " seconds.");
        log("Found " + nbFrequentSets + " frequent itemsets for support " + (minSup * 100) + "% (absolute " + Math.round(numTransactions * minSup) + ")");
        log("Done");
    }
    /** triggers actions if a frequent itemset has been found */
    private void foundFrequentItemSet(int[] itemset, int support) {
        if (usedAsLibrary) {
            this.setChanged();
            notifyObservers(itemset);
        }
        else {
            System.out.println(Arrays.toString(itemset) + " (" + ((support / (double) numTransactions)) + " " + support + ")");
        }
    }

    /** outputs a message to System.err if not used as a library */
    private void log(String message) {
        if (!usedAsLibrary) {
            System.err.println(message);
        }
    }
    /** computes numItems, numTransactions, and sets minSup */
    private void configure(String[] args) throws Exception
    {
        // setting transaFile
        if (args.length != 0) transaFile = args[0];
        else transaFile = "chess.dat"; // default

        // setting minSup
        if (args.length >= 2) minSup = Double.parseDouble(args[1]);
        else minSup = .8; // by default
        if (minSup > 1 || minSup < 0) throw new Exception("minSup: bad value");

        // going through the file to compute numItems and numTransactions
        numItems = 0;
        numTransactions = 0;
        BufferedReader data_in = new BufferedReader(new FileReader(transaFile));
        while (data_in.ready()) {
            String line = data_in.readLine();
            if (line.matches("\\s*")) continue; // be friendly with empty lines
            numTransactions++;
            StringTokenizer t = new StringTokenizer(line, " ");
            while (t.hasMoreTokens()) {
                int x = Integer.parseInt(t.nextToken());
                // log(x);
                if (x + 1 > numItems) numItems = x + 1;
            }
        }
        data_in.close();

        outputConfig();
    }
    /** outputs the current configuration */
    private void outputConfig() {
        // output config info to the user
        log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
        log("minsup = " + (minSup * 100) + "%");
    }
    /** puts in itemsets all sets of size 1,
     * i.e. all possible items of the dataset
     */
    private void createItemsetsOfSize1() {
        itemsets = new ArrayList<int[]>();
        for (int i = 0; i < numItems; i++)
        {
            int[] cand = { i };
            itemsets.add(cand);
        }
    }
    /**
     * if n is the size of the current itemsets,
     * generates all possible itemsets of size n+1 from pairs of current itemsets,
     * e.g. {1,2} and {1,3} are joined into the candidate {1,2,3};
     * replaces the content of itemsets with the new candidates
     */
    private void createNewItemsetsFromPreviousOnes()
    {
        // by construction, all existing itemsets have the same size
        int currentSizeOfItemsets = itemsets.get(0).length;
        log("Creating itemsets of size " + (currentSizeOfItemsets + 1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);

        HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); // temporary candidates

        // compare each pair of itemsets of the current size n
        for (int i = 0; i < itemsets.size(); i++)
        {
            for (int j = i + 1; j < itemsets.size(); j++)
            {
                int[] X = itemsets.get(i);
                int[] Y = itemsets.get(j);

                assert (X.length == Y.length);

                // the new candidate starts with the n items of X ...
                int[] newCand = new int[currentSizeOfItemsets + 1];
                for (int s = 0; s < newCand.length - 1; s++) {
                    newCand[s] = X[s];
                }
                int ndifferent = 0;
                // ... then we find the missing value
                for (int s1 = 0; s1 < Y.length; s1++)
                {
                    boolean found = false;
                    // is Y[s1] in X?
                    for (int s2 = 0; s2 < X.length; s2++) {
                        if (X[s2] == Y[s1]) {
                            found = true;
                            break;
                        }
                    }
                    if (!found) { // Y[s1] is not in X
                        ndifferent++;
                        // we put the missing value at the end of newCand
                        newCand[newCand.length - 1] = Y[s1];
                    }
                }
                // at least one item must differ; otherwise the same set would appear twice among the existing candidates
                assert (ndifferent > 0);

                if (ndifferent == 1) {
                    // HashMap does not have the correct "equals" for int[] :-(
                    // I have to create the key myself using a String :-(
                    // I use Arrays.toString to reuse equals and hashCode of String
                    Arrays.sort(newCand);
                    tempCandidates.put(Arrays.toString(newCand), newCand);
                }
            }
        }

        // set the new itemsets
        itemsets = new ArrayList<int[]>(tempCandidates.values());
        log("Created " + itemsets.size() + " unique itemsets of size " + (currentSizeOfItemsets + 1));
    }
    /** puts "true" in trans[i] if the integer i appears in line */
    private void line2booleanArray(String line, boolean[] trans) {
        Arrays.fill(trans, false);
        StringTokenizer stFile = new StringTokenizer(line, " "); // read a line from the file to the tokenizer
        // put the contents of that line into the transaction array
        while (stFile.hasMoreTokens())
        {
            int parsedVal = Integer.parseInt(stFile.nextToken());
            trans[parsedVal] = true; // mark item parsedVal as present in this transaction
        }
    }
    /** passes through the data to measure the frequency of the sets in {@link itemsets},
     * then filters out those that are below the minimum support (minSup)
     */
    private void calculateFrequentItemsets() throws Exception
    {
        log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);

        List<int[]> frequentCandidates = new ArrayList<int[]>(); // the frequent candidates for the current itemset size
        boolean match; // whether the transaction has all the items in an itemset
        int count[] = new int[itemsets.size()]; // the number of successful matches, initialized to zero

        // load the transaction file
        BufferedReader data_in = new BufferedReader(new InputStreamReader(new FileInputStream(transaFile)));

        boolean[] trans = new boolean[numItems];
        // for each transaction
        for (int i = 0; i < numTransactions; i++) {

            // boolean[] trans = extractEncoding1(data_in.readLine());
            String line = data_in.readLine();
            line2booleanArray(line, trans);

            // check each candidate
            for (int c = 0; c < itemsets.size(); c++) {

                match = true; // assume a match until an item of the candidate is found missing

                // get the candidate's items so that we know what needs to be
                // present in the transaction for a match
                int[] cand = itemsets.get(c);
                // int[] cand = candidatesOptimized[c];

                // check each item in the itemset to see if it is present in the
                // transaction
                for (int xx : cand) {
                    if (trans[xx] == false) {
                        match = false;
                        break;
                    }
                }
                if (match) { // if at this point it is a match, increase the count
                    count[c]++;
                    // log(Arrays.toString(cand)+" is contained in trans "+i+" ("+line+")");
                }
            }
        }

        data_in.close();

        for (int i = 0; i < itemsets.size(); i++) {
            // if the relative support reaches minSup, add the candidate to
            // the frequent candidates
            if ((count[i] / (double) (numTransactions)) >= minSup) {
                foundFrequentItemSet(itemsets.get(i), count[i]);
                frequentCandidates.add(itemsets.get(i));
            }
            // else log("-- Remove candidate: "+ Arrays.toString(itemsets.get(i)) + " is: "+ ((count[i] / (double) numTransactions)));
        }

        // the new candidates are only the frequent candidates
        itemsets = frequentCandidates;
    }
}