package mining;

import java.io.*;
import java.util.*;
/** The class encapsulates an implementation of the Apriori algorithm
 * to compute frequent itemsets.
 *
 * Datasets contain integers (>= 0) separated by spaces, one transaction per line, e.g.
 * 1 2 3
 * 0 9
 * 1 9
 *
 * Usage from the command line:
 * $ java mining.Apriori fileName support
 * $ java mining.Apriori /tmp/data.dat 0.8
 * $ java mining.Apriori /tmp/data.dat 0.8 > frequent-itemsets.txt
 *
 * Usage as a library: see {@link ExampleOfClientCodeOfApriori}
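 *
 * A minimal illustrative client (a sketch, not necessarily the actual ExampleOfClientCodeOfApriori;
 * each frequent itemset is passed to the observer as an int[]):
 * <pre>
 * Observer printer = new Observer() {
 *     public void update(Observable o, Object itemset) {
 *         System.out.println(Arrays.toString((int[]) itemset));
 *     }
 * };
 * new Apriori(new String[] { "/tmp/data.dat", "0.8" }, printer);
 * </pre>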
 *
 * @author Martin Monperrus, University of Darmstadt, 2010
 * @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009.
 *
 * GNU General Public License v3
 * No reproduction in whole or part without maintaining this copyright notice
 * and imposing this condition on any subsequent users.
 */
public class Apriori extends Observable {

    public static void main(String[] args) throws Exception {
        Apriori ap = new Apriori(args);
    }
    /** the list of current itemsets */
    private List<int[]> itemsets;
    /** the name of the transaction file */
    private String transaFile = "transa.txt";
    /** number of different items in the dataset */
    private int numItems;
    /** total number of transactions in transaFile */
    private int numTransactions;
    /** minimum support for a frequent itemset, as a fraction between 0 and 1 (e.g. 0.8 for 80%) */
    private double minSup;

    private boolean usedAsLibrary = false;
    /** This is the main interface to use this class as a library */
    public Apriori(String[] args, Observer ob) throws Exception
    {
        usedAsLibrary = true;
        configure(args);
        this.addObserver(ob);
        go();
    }
    /** generates the apriori itemsets from a file
     *
     * @param args configuration parameters: args[0] is a filename, args[1] the min support (e.g. 0.8 for 80%)
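     *
     * For example (the file path is just illustrative):
     * <pre>
     * new Apriori(new String[] { "/tmp/data.dat", "0.8" });
     * </pre>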
     */
    public Apriori(String[] args) throws Exception
    {
        configure(args);
        go();
    }
    /** starts the algorithm after configuration */
    private void go() throws Exception {
        // start timer
        long start = System.currentTimeMillis();

        // first we generate the candidates of size 1
        createItemsetsOfSize1();
        int itemsetNumber = 1; // the size of the itemsets currently being looked at
        int nbFrequentSets = 0;

        while (itemsets.size() > 0)
        {
            calculateFrequentItemsets();

            if (itemsets.size() != 0)
            {
                nbFrequentSets += itemsets.size();
                log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup * 100) + "%)");
                createNewItemsetsFromPreviousOnes();
            }

            itemsetNumber++;
        }

        // display the execution time
        long end = System.currentTimeMillis();
        log("Execution time is: " + ((double) (end - start) / 1000) + " seconds.");
        log("Found " + nbFrequentSets + " frequent itemsets for support " + (minSup * 100) + "% (absolute " + Math.round(numTransactions * minSup) + ")");
        log("Done");
    }
    /** triggers actions if a frequent itemset has been found */
    private void foundFrequentItemSet(int[] itemset, int support) {
        if (usedAsLibrary) {
            this.setChanged();
            notifyObservers(itemset);
        }
        else {
            System.out.println(Arrays.toString(itemset) + " (" + ((support / (double) numTransactions)) + " " + support + ")");
        }
    }

    /** outputs a message to System.err if not used as a library */
    private void log(String message) {
        if (!usedAsLibrary) {
            System.err.println(message);
        }
    }
    /** computes numItems, numTransactions, and sets minSup */
    private void configure(String[] args) throws Exception
    {
        // setting transaFile
        if (args.length != 0) transaFile = args[0];
        else transaFile = "chess.dat"; // default

        // setting minSup
        if (args.length >= 2) minSup = Double.parseDouble(args[1]);
        else minSup = .8; // by default
        if (minSup > 1 || minSup < 0) throw new Exception("minSup: bad value");

        // going through the file to compute numItems and numTransactions
        numItems = 0;
        numTransactions = 0;
        BufferedReader data_in = new BufferedReader(new FileReader(transaFile));
        while (data_in.ready()) {
            String line = data_in.readLine();
            if (line.matches("\\s*")) continue; // be friendly with empty lines
            numTransactions++;
            StringTokenizer t = new StringTokenizer(line, " ");
            while (t.hasMoreTokens()) {
                int x = Integer.parseInt(t.nextToken());
                // log(x);
                if (x + 1 > numItems) numItems = x + 1;
            }
        }
        data_in.close();

        outputConfig();
    }
    /** outputs the current configuration */
    private void outputConfig() {
        // output config info to the user
        log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, ");
        log("minsup = " + (minSup * 100) + "%");
    }
    /** puts in itemsets all sets of size 1,
     * i.e. all possible items of the dataset
     */
    private void createItemsetsOfSize1() {
        itemsets = new ArrayList<int[]>();
        for (int i = 0; i < numItems; i++)
        {
            int[] cand = { i };
            itemsets.add(cand);
        }
    }
    /**
     * if n is the size of the current itemsets,
     * generates all possible itemsets of size n+1 from pairs of current itemsets,
     * e.g. {1,2} and {1,3} are joined into the candidate {1,2,3};
     * replaces the content of itemsets with the new candidates
     */
    private void createNewItemsetsFromPreviousOnes()
    {
        // by construction, all existing itemsets have the same size
        int currentSizeOfItemsets = itemsets.get(0).length;
        log("Creating itemsets of size " + (currentSizeOfItemsets + 1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets);

        HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); // temporary candidates

        // compare each pair of itemsets of the current size n
        for (int i = 0; i < itemsets.size(); i++)
        {
            for (int j = i + 1; j < itemsets.size(); j++)
            {
                int[] X = itemsets.get(i);
                int[] Y = itemsets.get(j);

                assert (X.length == Y.length);

                // the new candidate starts with the n items of X ...
                int[] newCand = new int[currentSizeOfItemsets + 1];
                for (int s = 0; s < newCand.length - 1; s++) {
                    newCand[s] = X[s];
                }
                int ndifferent = 0;
                // ... then we find the missing value
                for (int s1 = 0; s1 < Y.length; s1++)
                {
                    boolean found = false;
                    // is Y[s1] in X?
                    for (int s2 = 0; s2 < X.length; s2++) {
                        if (X[s2] == Y[s1]) {
                            found = true;
                            break;
                        }
                    }
                    if (!found) { // Y[s1] is not in X
                        ndifferent++;
                        // we put the missing value at the end of newCand
                        newCand[newCand.length - 1] = Y[s1];
                    }
                }
                // at least one item must differ; otherwise the same set would appear twice among the existing candidates
                assert (ndifferent > 0);

                if (ndifferent == 1) {
                    // HashMap does not have the correct "equals" for int[] :-(
                    // I have to create the key myself using a String :-(
                    // I use Arrays.toString to reuse equals and hashCode of String
                    Arrays.sort(newCand);
                    tempCandidates.put(Arrays.toString(newCand), newCand);
                }
            }
        }

        // set the new itemsets
        itemsets = new ArrayList<int[]>(tempCandidates.values());
        log("Created " + itemsets.size() + " unique itemsets of size " + (currentSizeOfItemsets + 1));
    }
    /** puts "true" in trans[i] if the integer i appears in line */
    private void line2booleanArray(String line, boolean[] trans) {
        Arrays.fill(trans, false);
        StringTokenizer stFile = new StringTokenizer(line, " "); // read a line from the file to the tokenizer
        // put the contents of that line into the transaction array
        while (stFile.hasMoreTokens())
        {
            int parsedVal = Integer.parseInt(stFile.nextToken());
            trans[parsedVal] = true; // mark item parsedVal as present in this transaction
        }
    }
    /** passes through the data to measure the frequency of the sets in {@link itemsets},
     * then filters out those that are below the minimum support (minSup)
     */
    private void calculateFrequentItemsets() throws Exception
    {
        log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + itemsets.get(0).length);

        List<int[]> frequentCandidates = new ArrayList<int[]>(); // the frequent candidates for the current itemset size
        boolean match; // whether the transaction has all the items in an itemset
        int count[] = new int[itemsets.size()]; // the number of successful matches, initialized to zero

        // load the transaction file
        BufferedReader data_in = new BufferedReader(new InputStreamReader(new FileInputStream(transaFile)));

        boolean[] trans = new boolean[numItems];
        // for each transaction
        for (int i = 0; i < numTransactions; i++) {

            // boolean[] trans = extractEncoding1(data_in.readLine());
            String line = data_in.readLine();
            line2booleanArray(line, trans);

            // check each candidate
            for (int c = 0; c < itemsets.size(); c++) {

                match = true; // assume a match until an item of the candidate is found missing

                // get the candidate's items so that we know what needs to be
                // present in the transaction for a match
                int[] cand = itemsets.get(c);
                // int[] cand = candidatesOptimized[c];

                // check each item in the itemset to see if it is present in the
                // transaction
                for (int xx : cand) {
                    if (trans[xx] == false) {
                        match = false;
                        break;
                    }
                }
                if (match) { // if at this point it is a match, increase the count
                    count[c]++;
                    // log(Arrays.toString(cand)+" is contained in trans "+i+" ("+line+")");
                }
            }
        }

        data_in.close();

        for (int i = 0; i < itemsets.size(); i++) {
            // if the relative support reaches minSup, add the candidate to
            // the frequent candidates
            if ((count[i] / (double) (numTransactions)) >= minSup) {
                foundFrequentItemSet(itemsets.get(i), count[i]);
                frequentCandidates.add(itemsets.get(i));
            }
            // else log("-- Remove candidate: "+ Arrays.toString(itemsets.get(i)) + " is: "+ ((count[i] / (double) numTransactions)));
        }

        // the new candidates are only the frequent candidates
        itemsets = frequentCandidates;
    }
}