第三步,就是准备协同矩阵与用户向量相乘的过程了
//multiply the co-occurrence (similarity) matrix by the user preference vectors
//start the multiplication of the co-occurrence matrix by the user vectors
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
//first MapReduce: wrap every similarity-matrix row into a VectorOrPrefWritable
//keyed by item index (the identity Reducer just forwards the mapper output)
Job prePartialMultiply1 = prepareJob(
similarityMatrixPath, prePartialMultiplyPath1, SequenceFileInputFormat.class,
SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
SequenceFileOutputFormat.class);
boolean succeeded = prePartialMultiply1.waitForCompletion(true);
if (!succeeded)
return -1;
//second MapReduce: split each user vector into (itemIndex, <userID, pref>) pairs,
//the same key/value types as job 1 so both outputs can be joined later
//continue the multiplication
Job prePartialMultiply2 = prepareJob(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
prePartialMultiplyPath2, SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
SequenceFileOutputFormat.class);
if (usersFile != null) {
prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
}
prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
maxPrefsPerUser);
succeeded = prePartialMultiply2.waitForCompletion(true);
if (!succeeded)
return -1;
//finish the job
//third MapReduce: read both intermediate outputs (comma-joined input path),
//identity Mapper, then ToVectorAndPrefReducer groups by item index and bundles
//the similarity column with all (userID, pref) pairs for that item
Job partialMultiply = prepareJob(
new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2), partialMultiplyPath,
SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class, VectorOrPrefWritable.class,
ToVectorAndPrefReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
SequenceFileOutputFormat.class);
setS3SafeCombinedInputPath(partialMultiply, getTempPath(), prePartialMultiplyPath1, prePartialMultiplyPath2);
succeeded = partialMultiply.waitForCompletion(true);
if (!succeeded)
return -1;
}
下边也同样分析一下这三个MapReduce的细节:
1、Mapper: SimilarityMatrixRowWrapperMapper 类,将协同矩阵的一行拿出来,通过包装,封装成VectorOrPrefWritable类,与那边的UserVectorSplitterMapper 的输出类型一致
/**
 * Wraps one row of the item-item similarity matrix into a
 * {@code VectorOrPrefWritable} keyed by item index, so that its output type
 * matches that of {@code UserVectorSplitterMapper} and both can be joined
 * in the same reduce phase.
 */
public final class SimilarityMatrixRowWrapperMapper extends
    Mapper<IntWritable,VectorWritable,VarIntWritable,VectorOrPrefWritable> {

  @Override
  protected void map(IntWritable key,
                     VectorWritable value,
                     Context context) throws IOException, InterruptedException {
    int itemIndex = key.get();
    Vector row = value.get();
    // mask out self-similarity so an item is never used to predict itself
    row.set(itemIndex, Double.NaN);
    context.write(new VarIntWritable(itemIndex), new VectorOrPrefWritable(row));
  }
}
2、Mapper:UserVectorSplitterMapper类
//输入格式: theUserID:<itemid_index1,pref1>,<itemid_index2,pref2>........<itemid_indexN,prefN>
//输出格式: itemid1:<theUserID,pref1>
// itemid2:<theUserID,pref2>
// itemid3:<theUserID,pref3>
// ......
// itemidN:<theUserID,prefN>
public final class UserVectorSplitterMapper extends
Mapper<VarLongWritable,VectorWritable, VarIntWritable,VectorOrPrefWritable> {
// Input:  userID -> vector of (itemIndex, pref) entries
// Output: one (itemIndex, <userID, pref>) record per non-zero preference
@Override
protected void map(VarLongWritable key,
VectorWritable value,
Context context) throws IOException, InterruptedException {
long userID = key.get();
// usersToRecommendFor is an optional user whitelist (loaded elsewhere in this
// class, not shown in this excerpt); skip users outside it
if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) {
return;
}
// cap the number of preferences considered per user; maybePruneUserVector is
// defined elsewhere in this class
Vector userVector = maybePruneUserVector(value.get());
Iterator<Vector.Element> it = userVector.iterateNonZero();
// output writables are reused across iterations to avoid per-record allocation
VarIntWritable itemIndexWritable = new VarIntWritable();
VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();
while (it.hasNext()) {
Vector.Element e = it.next();
itemIndexWritable.set(e.index());
vectorOrPref.set(userID, (float) e.get());
context.write(itemIndexWritable, vectorOrPref);
}
}
3、Reduce:ToVectorAndPrefReducer类,收集协同矩阵为itemid的一行,并且收集评价过该item的用户和评分,最后的输出是 itemid_index,VectorAndPrefsWritable(vector,List<userid>,List<pref>)
/**
 * For one item index, collects the item's similarity-matrix column together
 * with every (userID, pref) pair of users who rated that item, and emits them
 * bundled as a single {@code VectorAndPrefsWritable}:
 * itemIndex -> (similarityColumn, List&lt;userID&gt;, List&lt;pref&gt;).
 */
public final class ToVectorAndPrefReducer extends
    Reducer<VarIntWritable,VectorOrPrefWritable,VarIntWritable,VectorAndPrefsWritable> {

  @Override
  protected void reduce(VarIntWritable key,
                        Iterable<VectorOrPrefWritable> values,
                        Context context) throws IOException, InterruptedException {
    List<Long> userIDs = Lists.newArrayList();
    List<Float> prefValues = Lists.newArrayList();
    Vector similarityMatrixColumn = null;
    for (VectorOrPrefWritable value : values) {
      Vector candidate = value.getVector();
      if (candidate == null) {
        // a user-preference record for this item
        userIDs.add(value.getUserID());
        prefValues.add(value.getValue());
      } else {
        // the similarity-matrix column for this item; there must be exactly one
        if (similarityMatrixColumn != null) {
          throw new IllegalStateException("Found two similarity-matrix columns for item index " + key.get());
        }
        similarityMatrixColumn = candidate;
      }
    }
    // nothing to emit for an item that has no similarity column
    if (similarityMatrixColumn == null) {
      return;
    }
    context.write(key, new VectorAndPrefsWritable(similarityMatrixColumn, userIDs, prefValues));
  }
}
第四步,协同矩阵和用户向量相乘,得到推荐结果
//extract out the recommendations
//fourth MapReduce: PartialMultiplyMapper fans each item bundle out per user,
//AggregateAndRecommendReducer sums the partial products into predictions and
//writes the top items per user as text
Job aggregateAndRecommend = prepareJob(
new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
TextOutputFormat.class);
Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
Mapper:PartialMultiplyMapper类
//输入类型:( itemid_index, <userid的数组,pref的数组,协同矩阵行号为itemid_index的行> )
//输出类型: userid,<该用户对itemid_index1的评分,协同矩阵行号为itemid_index1的行> )
// userid,<该用户对itemid_index2的评分,协同矩阵行号为itemid_index2的行> )
// .....
// .....
// userid,<该用户对itemid_indexN的评分,协同矩阵行号为itemid_indexN的行> )
/**
 * Fans one item's bundle (similarity column, userIDs, prefs) out per user:
 * for every user who rated the item with a real (non-NaN) preference, emits
 * userID -> (that user's pref for the item, the item's similarity column).
 */
public final class PartialMultiplyMapper extends
    Mapper<VarIntWritable,VectorAndPrefsWritable,VarLongWritable,PrefAndSimilarityColumnWritable> {

  @Override
  protected void map(VarIntWritable key,
                     VectorAndPrefsWritable vectorAndPrefsWritable,
                     Context context) throws IOException, InterruptedException {
    Vector similarityColumn = vectorAndPrefsWritable.getVector();
    List<Long> users = vectorAndPrefsWritable.getUserIDs();
    List<Float> prefs = vectorAndPrefsWritable.getValues();

    // reuse the output writables to avoid one allocation per record
    VarLongWritable userIDWritable = new VarLongWritable();
    PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable();

    int numUsers = users.size();
    for (int i = 0; i < numUsers; i++) {
      float pref = prefs.get(i);
      if (Float.isNaN(pref)) {
        continue;  // placeholder preference, contributes nothing
      }
      prefAndSimilarityColumn.set(pref, similarityColumn);
      userIDWritable.set(users.get(i));
      context.write(userIDWritable, prefAndSimilarityColumn);
    }
  }
}
Reducer:AggregateAndRecommendReducer类,Reducer中进行PartialMultiply,按乘积得到的推荐度的大小取出最大的几个item。对于非booleanData,是用pref和相似度矩阵的PartialMultiply得到推荐度的值来进行排序。
而booleanData的pref值都是1.0f,所以去计算矩阵相乘的过程没有意义,直接累加相似度的值即可。
用这个数据排序就可得到推荐结果
public final class AggregateAndRecommendReducer extends
Reducer<VarLongWritable,PrefAndSimilarityColumnWritable,VarLongWritable,RecommendedItemsWritable> {
// Aggregates all (pref, similarity column) pairs of a user into predicted
// ratings and emits the top recommended items. booleanData, BOOLEAN_PREF_VALUE,
// ABSOLUTE_VALUES and writeRecommendedItems are declared elsewhere in this
// class (not shown in this excerpt).
@Override
protected void reduce(VarLongWritable userID,
Iterable<PrefAndSimilarityColumnWritable> values,
Context context) throws IOException, InterruptedException {
if (booleanData) {
reduceBooleanData(userID, values, context);
} else {
reduceNonBooleanData(userID, values, context);
}
}
// Boolean data: every pref is 1.0f, so multiplying by prefs is pointless;
// rank items by the plain sum of similarities instead.
private void reduceBooleanData(VarLongWritable userID,
Iterable<PrefAndSimilarityColumnWritable> values,
Context context) throws IOException, InterruptedException {
/* having boolean data, each estimated preference can only be 1,
* however we can't use this to rank the recommended items,
* so we use the sum of similarities for that. */
Vector predictionVector = null;
for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
predictionVector = predictionVector == null
? prefAndSimilarityColumn.getSimilarityColumn()
: predictionVector.plus(prefAndSimilarityColumn.getSimilarityColumn());
}
writeRecommendedItems(userID, predictionVector, context);
}
private void reduceNonBooleanData(VarLongWritable userID,
Iterable<PrefAndSimilarityColumnWritable> values,
Context context) throws IOException, InterruptedException {
/* each entry here is the sum in the numerator of the prediction formula */
Vector numerators = null;
/* each entry here is the sum in the denominator of the prediction formula */
Vector denominators = null;
/* each entry here is the number of similar items used in the prediction formula */
Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
float prefValue = prefAndSimilarityColumn.getPrefValue();
/* count the number of items used for each prediction */
Iterator<Vector.Element> usedItemsIterator = simColumn.iterateNonZero();
while (usedItemsIterator.hasNext()) {
int itemIDIndex = usedItemsIterator.next().index();
numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
}
// vector.times(x) scales every entry by x; vector.plus(v) adds element-wise.
// numerators accumulates, per candidate item, the weighted sum
/*
e.g. the entry at index item_1 is:
similarity(item_1, item_2) * pref(userID, item_2)
+ similarity(item_1, item_3) * pref(userID, item_3)
+ similarity(item_1, item_4) * pref(userID, item_4)
+ ...
+ similarity(item_1, item_N) * pref(userID, item_N)
*/
// where similarity(a, b) is the item-item similarity and pref(u, i) is user
// u's rating of item i. When prefValue equals BOOLEAN_PREF_VALUE the multiply
// is skipped since scaling by that value would be redundant.
numerators = numerators == null
? prefValue == BOOLEAN_PREF_VALUE ? simColumn.clone() : simColumn.times(prefValue)
: numerators.plus(prefValue == BOOLEAN_PREF_VALUE ? simColumn : simColumn.times(prefValue));
// NOTE: assign mutates simColumn in place (ABSOLUTE_VALUES presumably maps
// each entry to its absolute value — defined elsewhere in this class)
simColumn.assign(ABSOLUTE_VALUES);
// denominators accumulates, per candidate item, the normalizer
/*
e.g. the entry at index item_1 is:
|similarity(item_1, item_2)| + |similarity(item_1, item_3)| + ... + |similarity(item_1, item_N)|
*/
denominators = denominators == null ? simColumn : denominators.plus(simColumn);
}
if (numerators == null) {
return;
}
Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
Iterator<Vector.Element> iterator = numerators.iterateNonZero();
while (iterator.hasNext()) {
Vector.Element element = iterator.next();
int itemIDIndex = element.index();
/* preference estimations must be based on at least 2 datapoints */
if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
/* compute normalized prediction */
// weighted average: numerator / sum of absolute similarities
double prediction = element.get() / denominators.getQuick(itemIDIndex);
recommendationVector.setQuick(itemIDIndex, prediction);
}
}
writeRecommendedItems(userID, recommendationVector, context);
}
}