小菜鸟注册优快云账号好久了,最近才决定开始记录自己的学习经历。哈哈^_^
前几天学习了基于map-reduce的物品推荐算法的实现,写下来和大家分享。
首先,整段代码可分为5步:
step1.根据用户行为列表构建评分矩阵。
step2.利用评分矩阵,构建物品与物品的相似度矩阵。(在这里利用余弦相似度计算物品与物品的相似度矩阵)
多维向量的余弦相似度:
a(A1,A2,A3.....An)
b(B1,B2,B3,....Bn)
cos(a,b) = (A1B1 + A2B2 + ... + AnBn) / ( sqrt(A1^2 + A2^2 + ... + An^2) * sqrt(B1^2 + B2^2 + ... + Bn^2) )
step3.将评分矩阵转置。
step4.物品与物品相似度矩阵*评分矩阵(经过步骤3转置)。
step5.根据评分矩阵,将步骤4的输出中,用户已经有过行为的商品评分置0 。
接下来,我们看看具体实现。
step1.根据用户行为列表构建评分矩阵。
输入矩阵每列代表:用户ID;物品ID;分值 输入文件路径为: /ItemCF/step1_input/ActionList.txt
eg:输入矩阵:
A,1,1
C,3,5
B,2,3
B,5,3
B,6,5
A,2,10
C,3,10
C,4,5
C,1,5
A,1,1
A,6,5
A,4,3
<---------------step1的map阶段----------------->
package step1; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Mapper.Context; public class Mapper1 extends Mapper<LongWritable,Text,Text,Text>{ private Text outKey = new Text(); private Text outValue = new Text(); protected void map(LongWritable key,Text value,Mapper<LongWritable,Text,Text,Text>.Context context) throws IOException, InterruptedException{ try{ String[] values = value.toString().split(","); String userID = values[0]; String itemID = values[1]; String score = values[2]; outKey.set(itemID); outValue.set(userID+"_"+score); context.write(outKey, outValue); }catch(Exception e){ e.printStackTrace(); } } }
<---------------step1的reduce阶段----------------->
package step1; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Reducer.Context; public class Reducer1 extends Reducer<Text,Text,Text,Text> { private Text outKey = new Text(); private Text outValue = new Text(); protected void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{ try{ String itemID = key.toString(); //<userID,score> Map<String,Integer> map = new HashMap<String,Integer>(); for(Text value: values){ String userID = value.toString().split("_")[0]; String score = value.toString().split("_")[1]; if(map.get(userID) == null){ map.put(userID, Integer.valueOf(score)); }else{ Integer preScore = map.get(userID); map.put(userID, preScore+Integer.valueOf(score)); } } StringBuilder sBuilder = new StringBuilder(); for(Map.Entry<String, Integer> entry:map.entrySet()){ String userID = entry.getKey(); String score = String.valueOf(entry.getValue()); sBuilder.append(userID + "_" + score + ","); } String line = null; if(sBuilder.toString().endsWith(",")){ line = sBuilder.substring(0,sBuilder.length()-1); } outKey.set(itemID); outValue.set(line); context.write(outKey, outValue); }catch(Exception e){ e.printStackTrace(); } } }
<---------------step1的主函数----------------->
package step1;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step 1 driver: builds the item-by-user rating matrix from the raw action
 * list. Reads {@code /ItemCF/step1_input/ActionList.txt} and writes to
 * {@code /ItemCF/step1_output/}.
 */
public class MR1 {

    private static final String INPUT_PATH = "/ItemCF/step1_input/ActionList.txt";
    private static final String OUTPUT_PATH = "/ItemCF/step1_output/";
    private static final String HDFS_PATH = "hdfs://hadoop:9000";

    /**
     * Configures and runs the step-1 MapReduce job.
     *
     * @return 1 on success, -1 on failure (job failed or an exception occurred)
     */
    public int run() {
        try {
            // Job configuration pointing at the cluster's HDFS.
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", HDFS_PATH);

            Job job = Job.getInstance(conf, "step1");
            job.setJarByClass(MR1.class);
            job.setMapperClass(Mapper1.class);
            job.setReducerClass(Reducer1.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            FileSystem fs = FileSystem.get(conf);
            Path inPath = new Path(INPUT_PATH);
            if (fs.exists(inPath)) {
                FileInputFormat.addInputPath(job, inPath);
            }
            // Delete stale output so the job does not abort on an existing directory.
            Path outPath = new Path(OUTPUT_PATH);
            fs.delete(outPath, true);
            FileOutputFormat.setOutputPath(job, outPath);

            return job.waitForCompletion(true) ? 1 : -1;
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        // BUG FIX: the original fell through to `return 1` here, so main()
        // printed "success" even when the job threw an exception.
        return -1;
    }

    public static void main(String[] args) {
        int result = new MR1().run();
        System.out.println("result0 = " + result);
        if (result == 1) {
            System.out.println("result1 = " + result);
            System.out.println("success");
        } else if (result == -1) {
            System.out.println("result2 = " + result);
            System.out.println("defeat");
        }
    }
}
经step1阶段输出结果为:
输出矩阵每列代表: 物品ID(行);用户ID(列);分值 输出路径为 /ItemCF/step1_output/。
输出结果:
1 A_2,C_5
&nb