1. Frequent itemsets: the concept was originally proposed for shopping-cart (market-basket) analysis; a frequent itemset is simply a set of items that appears together frequently in shopping carts.
2. Key concepts:
Support of an association rule: Support(A=>B) = (number of transactions containing both A and B) / (total number of transactions)
Confidence of an association rule: Confidence(A=>B) = (number of transactions containing both A and B) / (number of transactions containing A)
Frequent itemset: an itemset whose support is at least the minimum support.
Strong association rule: a rule that satisfies both the minimum support and the minimum confidence.
3. Steps of association rule mining:
Generate the frequent itemsets first, then derive the rules from them.
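To make the two formulas above concrete, here is a small self-contained Java example that computes Support(A=>B) and Confidence(A=>B) by brute force; the five toy transactions are invented purely for illustration:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SupportConfidenceDemo {
    public static void main(String[] args) {
        // five toy shopping carts (transactions)
        List<Set<String>> transactions = Arrays.<Set<String>>asList(
                new HashSet<String>(Arrays.asList("A", "B", "C")),
                new HashSet<String>(Arrays.asList("A", "B")),
                new HashSet<String>(Arrays.asList("A", "C")),
                new HashSet<String>(Arrays.asList("B", "C")),
                new HashSet<String>(Arrays.asList("A", "B", "D")));
        int countAB = 0; // transactions containing both A and B
        int countA = 0;  // transactions containing A
        for (Set<String> t : transactions) {
            if (t.contains("A")) {
                countA++;
                if (t.contains("B")) {
                    countAB++;
                }
            }
        }
        // Support(A=>B) = 3 / 5 = 0.6
        System.out.println("support    = " + (double) countAB / transactions.size());
        // Confidence(A=>B) = 3 / 4 = 0.75
        System.out.println("confidence = " + (double) countAB / countA);
    }
}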
Theory alone has little practical value, so the rest of this article is based on one of my real projects and shows how to apply frequent itemsets to association-rule item recommendation. For confidentiality, all of the data shown here has been anonymized. The project is implemented in MapReduce and consists of three map-reduce jobs.
The first map-reduce class, CrossRecommendStep1, generates the frequent 1-itemsets. Its input file is order_wash.txt, which in a real project is typically aggregated from users' order data; the log cleaning and aggregation are outside the scope of this article. (A complete recommendation pipeline takes several steps: log cleaning, per-user frequently-bought lists, the recommendation algorithm itself, and so on. Here we focus on the practical application of the cross-recommendation algorithm.) order_wash.txt looks like this:
accessTime mem_guid category
2016-04-20 11:31:20 FN05916 CC204316,CC304119,CC404115
2016-04-20 11:31:20 FN05917 CC204315,CC304111,CC404115
2016-04-20 11:31:20 FN05918 CC204314,CC304112,CC404115
2016-04-20 11:31:20 FN05919 CC204311,CC304113,CC404117
2016-04-20 11:31:20 FN05920 CC204311,CC304115,CC404116
2016-04-20 11:31:20 FN05921 CC204311,CC304115,CC404115
Now let's look at the implementation of CrossRecommendStep1. It counts the number of purchases under each category, across all users: a user who buys several times is counted several times (adapt this to your own business logic if needed). Part of the final output is shown below:
category count
CC204316 1
CC204311 3
CC404115 4
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
 * Cross recommendation: CC => CC
 * Used for cross-selling after an order completes.
 * Step 1: generate the frequent 1-itemsets.
 * @author jianting.zhao
 *
 * main() is the driver, written in the usual boilerplate style:
 * <in> is the input path, <out> the output path.
 */
public class CrossRecommendStep1 {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: CrossRecommendStep1 <in> <out>");
System.exit(2);
}
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(otherArgs[1]);
// delete stale output up front so the job can be rerun
// (deleteOnExit would only remove it when the FileSystem closes)
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
Job job = new Job(conf, "CrossRecommendStep1");
job.setJarByClass(CrossRecommendStep1.class);
job.setMapperClass(CrossRecommendStep1Map.class);
job.setReducerClass(CrossRecommendStep1Reduce.class);
// set the map and final output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(10);
// set the input and output file formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, outPath);
job.waitForCompletion(true);
}
public static class CrossRecommendStep1Map extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split("\t");
if (line.length == 3) {
// line[0] is accessTime and line[1] is mem_guid; only the
// category list in line[2] matters for the counts
String category = line[2];
String[] temp = category.split(",");
for (int k = 0; k < temp.length; k++) {
if (temp[k].isEmpty()) { // split() never yields null entries
continue;
}
context.write(new Text(temp[k]), one);
}
}
}
}
public static class CrossRecommendStep1Reduce extends Reducer<Text, IntWritable, Text, Text> {
@Override
protected void reduce(Text key, Iterable<IntWritable> value, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable n : value) {
sum += n.get();
}
context.write(key, new Text(String.valueOf(sum)));
}
}
}
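A small optional improvement, not part of the original job: since the map emits plain (category, 1) pairs, a combiner can pre-sum the counts on the map side and shrink the shuffle. The reducer above cannot be reused as the combiner because it emits Text values while the map output value type is IntWritable, so a separate summing combiner would be needed. A minimal sketch, meant as another nested class of CrossRecommendStep1 and enabled in the driver with job.setCombinerClass(CrossRecommendStep1Combine.class):

// hypothetical combiner: pre-sums (category, 1) pairs on the map side
public static class CrossRecommendStep1Combine
        extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> value, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable n : value) {
            sum += n.get();
        }
        context.write(key, new IntWritable(sum));
    }
}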
The second map-reduce class, CrossRecommendStep2, generates the frequent 2-itemsets. Its inputs are order_wash.txt and the output of CrossRecommendStep1. The code below is commented, so I won't walk through it separately.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Generate the frequent 2-itemsets.
 * The final output format is
 * ccA ccB support confidence
 * CC204311 CC204314 8 0.8
 *
 * support is the total number of ccA=>ccB co-occurrences; confidence is
 * support divided by ccA's total purchase count.
 * @author jianting.zhao
 */
public class CrossRecommendStep2 {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.err.println("Usage: CrossRecommendStep2 <in> <in> <out>");
System.exit(2);
}
FileSystem fs = FileSystem.get(conf);
Path freqset1 = new Path(otherArgs[1]);
FileStatus[] user_stat = fs.listStatus(freqset1);
for (FileStatus f : user_stat) { // cache the previous step's output (the frequent 1-itemsets)
if (f.getPath().getName().indexOf("_SUCCESS") == -1 && f.isFile()) {
DistributedCache.addCacheFile(f.getPath().toUri(), conf);
}
}
if (fs.exists(new Path(otherArgs[2]))) {
fs.delete(new Path(otherArgs[2]),true);
}
Job job = new Job(conf, "CrossRecommendStep2");
job.setJarByClass(CrossRecommendStep2.class);
job.setMapperClass(CrossRecommendStep2Map.class);
job.setReducerClass(CrossRecommendStep2Reduce.class);
// set the map and final output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(1);
// set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
job.waitForCompletion(true);
}
public static class CrossRecommendStep2Map extends Mapper<Object, Text, Text, IntWritable> {
HashMap<String, Integer> ctg_count = new HashMap<String, Integer>();
private static final IntWritable one = new IntWritable(1);
/*
 * Load the frequent 1-itemsets: categories bought at least 10 times
 * are kept in HashMap<String, Integer> ctg_count.
 */
protected void setup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
Path[] file = DistributedCache.getLocalCacheFiles(conf);
FileSystem fs = FileSystem.getLocal(conf);
String line = null;
for (Path path : file) {
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
while ((line = reader.readLine()) != null) {
String[] tmp = line.split("\t");
if (tmp.length != 2) {
continue;
}
String ctg = tmp[0];
int num = Integer.parseInt(tmp[1]);
if (num >= 10) {
ctg_count.put(ctg, num);
}
}
}
}
/**
 * This map reads the raw order file and, keeping only categories that
 * are in the frequent 1-itemset, emits the ccA=>ccB and ccB=>ccA pairs.
 */
@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split("\t");
if (line.length == 3) {
String stg_set = line[2];
List<String> order_list = new ArrayList<String>();
String[] tmp = stg_set.split(",");
// drop categories that are not in the frequent 1-itemset
for (int k = 0; k < tmp.length; k++) {
if (tmp[k] == null) {
continue;
}
if (ctg_count.get(tmp[k]) != null) {
order_list.add(tmp[k]);
}
}
// build the 2-itemsets; emit both directions, since confidence is directional
for (int i = 0; i < order_list.size(); i++) {
String ccA = order_list.get(i);
for (int j = i + 1; j < order_list.size(); j++) {
String ccB = order_list.get(j);
context.write(new Text(ccA + ":" + ccB), one);
context.write(new Text(ccB + ":" + ccA), one);
}
}
}
}
}
public static class CrossRecommendStep2Reduce extends Reducer<Text, IntWritable, Text, Text> {
HashMap<String, Integer> ctg_count = new HashMap<String, Integer>();
DecimalFormat df = new DecimalFormat("0.00");
/*
 * Load the frequent 1-itemsets (same logic as the map side).
 */
protected void setup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
Path[] file = DistributedCache.getLocalCacheFiles(conf);
FileSystem fs = FileSystem.getLocal(conf);
String line = null;
for (Path path : file) {
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
while ((line = reader.readLine()) != null) {
String[] tmp = line.split("\t");
if (tmp.length != 2) {
continue;
}
String ctg = tmp[0];
int num = Integer.parseInt(tmp[1]);
if (num >= 10) {
ctg_count.put(ctg, num);
}
}
}
}
/*
 * Compute the support and confidence of each 2-itemset.
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> value, Context context)
throws IOException, InterruptedException {
String[] line = key.toString().split(":");
String ccA = line[0];
String ccB = line[1];
// the map side only emits frequent categories, but guard anyway
Integer ccA_num = ctg_count.get(ccA);
if (ccA_num == null) {
return;
}
int sum = 0;
for (IntWritable n : value) {
sum += n.get();
}
// support: the number of ccA=>ccB co-occurrences
int support = sum;
// confidence: support divided by ccA's total purchase count
double confidence = (double) sum / ccA_num;
StringBuffer sb = new StringBuffer();
sb.append(support).append("\t").append(df.format(confidence));
if (confidence > 0.0) {
context.write(new Text(ccA + "\t" + ccB), new Text(sb.toString()));
}
}
}
}
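A quick aside: the DistributedCache API used above is deprecated in Hadoop 2.x. On a newer release the equivalent wiring would look roughly like the skeleton below; on the driver side you would call job.addCacheFile(f.getPath().toUri()) instead of DistributedCache.addCacheFile(f.getPath().toUri(), conf). This is an illustrative sketch, not the project's original code:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// skeleton mapper showing the non-deprecated cache API
public class ModernCacheMapper extends Mapper<Object, Text, Text, IntWritable> {
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        URI[] files = context.getCacheFiles(); // replaces DistributedCache.getLocalCacheFiles
        if (files == null) {
            return;
        }
        for (URI uri : files) {
            Path path = new Path(uri);
            FileSystem fs = path.getFileSystem(conf);
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // parse "category \t count" lines exactly as in the setup() above
                }
            } finally {
                reader.close();
            }
        }
    }
}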
The third map-reduce class, CrossRecommendStep3, produces the recommendation results from per-category frequently-bought lists; how those lists are generated is outside the scope of this article.
The overall idea of the recommendation: when the association ccA=>ccB is above some threshold, use ccB's frequently bought items as the recommendation results for ccA.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Recommend frequently-bought items via the associated categories.
 * This step uses each category's frequently-bought list.
 * The overall idea: when the association ccA=>ccB is above some threshold,
 * use ccB's frequently bought items as the recommendation results for ccA.
 * @author jianting.zhao
 */
public class CrossRecommendStep3 {
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 3) {
System.err.println("Usage: CrossRecommendStep3 <in> <in> <out>");
System.exit(2);
}
// cache the frequently-bought lists
FileSystem fs = FileSystem.get(conf);
Path freqset1 = new Path(otherArgs[1]);
FileStatus[] user_stat = fs.listStatus(freqset1);
for (FileStatus f : user_stat) {
if (f.getPath().getName().indexOf("_SUCCESS") == -1) {
DistributedCache.addCacheFile(f.getPath().toUri(), conf);
}
}
if (fs.exists(new Path(otherArgs[2]))) {
fs.delete(new Path(otherArgs[2]),true);
}
Job job = new Job(conf, "CrossRecommendStep3");
job.setJarByClass(CrossRecommendStep3.class);
job.setMapperClass(CrossRecommendStep3Map.class);
job.setReducerClass(CrossRecommendStep3Reduce.class);
// set the output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(1);
// set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
job.waitForCompletion(true);
}
public static class CrossRecommendStep3Map extends Mapper<Object, Text, Text, Text> {
/*
 * Read the frequent 2-itemsets produced by Step2. Each input line is
 * "ccA \t ccB \t support \t confidence"; note that with the default
 * TextInputFormat the whole line arrives in value (key is the byte
 * offset), so we must split value, not key.
 */
@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split("\t");
if (line.length == 4) {
String ctg1 = line[0];
String ctg_rec = line[1];
/*
 * Uncomment to require a minimum association strength before a
 * pair is used for recommendation:
 */
/*double support = Double.parseDouble(line[2]);
double confidence = Double.parseDouble(line[3]);
if (support < 100 || confidence < 0.8) {
return;
}*/
context.write(new Text(ctg1), new Text(ctg_rec));
context.write(new Text(ctg_rec), new Text(ctg1));
}
}
}
public static class CrossRecommendStep3Reduce extends Reducer<Text, Text, Text, Text> {
Map<String, ArrayList<String>> ctg_often_items = new HashMap<String, ArrayList<String>>();
/*
 * Load each category's frequently-bought list (at most 20 items per
 * category) from the distributed cache.
 */
protected void setup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
Path[] file = DistributedCache.getLocalCacheFiles(conf);
FileSystem fs = FileSystem.getLocal(conf);
String line = null;
for (Path path : file) {
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
while ((line = reader.readLine()) != null) {
String[] tmp = line.split("\t");
if (tmp.length != 3) {
continue;
}
String ctg = tmp[0];
String item_id = tmp[1];
ArrayList<String> often_items = ctg_often_items.get(ctg);
if (often_items == null || often_items.isEmpty()) {
often_items = new ArrayList<String>();
ctg_often_items.put(ctg, often_items);
}
if (often_items.size() < 20) {
often_items.add(item_id);
}
}
}
}
@Override
protected void reduce(Text key, Iterable<Text> value, Context context)
throws IOException, InterruptedException {
// the reducer's Iterable can only be consumed once, so cache the
// related categories before the round-robin pass below
ArrayList<String> related_ctgs = new ArrayList<String>();
for (Text val : value) {
related_ctgs.add(val.toString());
}
ArrayList<String> recommend_list = new ArrayList<String>();
// fill the recommendation list round-robin: one frequently-bought item
// from each related category per pass, until 50 recommendations are
// collected or all the item lists are exhausted
int temp = 0; // current index into each category's item list
int size = 0; // largest item-list index seen so far
while (recommend_list.size() < 50 && temp <= size) {
for (String rec_ctg : related_ctgs) {
if (recommend_list.size() >= 50) {
break;
}
ArrayList<String> often_items = ctg_often_items.get(rec_ctg);
if (often_items == null) {
continue;
}
size = Math.max(size, often_items.size() - 1);
if (temp >= often_items.size()) {
continue;
}
String comple_itemid = often_items.get(temp);
if (!recommend_list.contains(comple_itemid)) {
recommend_list.add(comple_itemid);
}
}
temp++;
}
if (recommend_list.size() > 0) {
context.write(key, new Text(recommend_list.toString()));
}
}
}
}
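Finally, the three jobs must run in sequence, each consuming the previous step's output. In the real project a scheduler drove them; purely as an illustration, a minimal chaining driver could look like the sketch below. All paths here are invented examples, and often_buy.txt stands in for the per-category frequently-bought list whose generation is out of scope:

// hypothetical driver that chains the three steps; all paths are examples
public class CrossRecommendDriver {
    public static void main(String[] args) throws Exception {
        String orders = "/data/order_wash.txt";  // raw order file
        String freq1 = "/out/step1";             // frequent 1-itemsets
        String freq2 = "/out/step2";             // frequent 2-itemsets
        String oftenBuy = "/data/often_buy.txt"; // frequently-bought lists
        String result = "/out/step3";            // final recommendations
        CrossRecommendStep1.main(new String[]{orders, freq1});
        CrossRecommendStep2.main(new String[]{orders, freq1, freq2});
        CrossRecommendStep3.main(new String[]{freq2, oftenBuy, result});
    }
}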