一.此前的数据集
movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
...
ratings.dat
1::1193::5::978300760
1::661::3::978302109
...
users.dat 记录用户的属性,不需要
1::F::1::10::48067
2::M::56::16::70072
...
二.当前的含有信任矩阵的数据集,初始信任很稀疏,不包含项目名字
ratings_data.txt
1 100 4
1 101 5
...
trust_data.txt
22605 42915 1
22605 5052 1
...
三.Python代码,初步版本,试用数据集1(注意:下面的加载函数实际读取的是以空格分隔的 ratings_data.dat 和 trust_data.dat,即第二部分的数据格式;文件扩展名与上文的 .txt 不一致,运行前请确认文件名)
from math import sqrt
def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for person1 and person2.

    prefs maps each person to a dict of {item: rating}.  The score is
    1 / (1 + sum of squared rating differences over the shared items), so
    identical ratings give 1.0 and the score shrinks towards 0 as the
    ratings diverge.  Returns 0 when the two people share no items.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items rated by both people.
    shared = [item for item in ratings1 if item in ratings2]

    # If they have no ratings in common, there is nothing to compare.
    if not shared:
        return 0

    # Add up the squares of all the differences.
    sum_of_squares = sum(
        (ratings1[item] - ratings2[item]) ** 2 for item in shared
    )

    # 1.0 forces true division: with integer ratings, Python 2's "/" would
    # truncate every non-identical score down to 0.
    return 1.0 / (1 + sum_of_squares)
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    prefs maps each person to a dict of {item: rating}.  The correlation is
    computed over the items both people have rated and lies in [-1, 1].
    Returns 0 when there are no shared items or when either person's
    ratings over the shared items have no variance (the coefficient is
    undefined in that case).
    """
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]

    # Items rated by both people.
    shared = [it for it in ratings1 if it in ratings2]
    if not shared:
        return 0

    # float() keeps every division below a true division even when the
    # ratings are integers under Python 2.
    n = float(len(shared))

    xs = [ratings1[it] for it in shared]
    ys = [ratings2[it] for it in shared]

    # Sums of the preferences, of their squares, and of their products.
    sum1 = sum(xs)
    sum2 = sum(ys)
    sum1Sq = sum(x * x for x in xs)
    sum2Sq = sum(y * y for y in ys)
    pSum = sum(x * y for x, y in zip(xs, ys))

    # n * variance of each person's ratings over the shared items.
    spread1 = sum1Sq - sum1 * sum1 / n
    spread2 = sum2Sq - sum2 * sum2 / n

    # Rounding can leave a spread at zero or marginally below it; sqrt()
    # would then divide by zero or raise ValueError, so treat it as
    # "no correlation".
    if spread1 <= 0 or spread2 <= 0:
        return 0

    r = (pSum - sum1 * sum2 / n) / sqrt(spread1 * spread2)

    # Clamp away floating-point drift such as 1.0000000000000002.
    return max(-1.0, min(1.0, r))
def topMatches(prefs, td, person, n=10, similarity=sim_pearson):
    """Return the best matches for person from the prefs dictionary.

    Each other user is scored as
        0.7 * similarity(prefs, person, other) + 0.3 * trust
    where trust is td[person][other] when that entry exists and 0
    otherwise.

    prefs      -- dict of {user: {item: rating}}
    td         -- dict of {truster: {trustee: trust value}}; may be sparse
    person     -- the user to find matches for
    n          -- maximum number of results (optional)
    similarity -- function (prefs, a, b) -> score (optional)

    Returns a list of (score, other) tuples, highest score first.
    """
    similarity_weight = 0.7
    trust_weight = 0.3

    # The trust data is sparse: a user with no outgoing trust statements has
    # no row in td at all, so fall back to an empty row instead of raising
    # KeyError.
    trusted = td.get(person, {})

    scores = [
        (similarity_weight * similarity(prefs, person, other)
         + trust_weight * trusted.get(other, 0), other)
        for other in prefs
        if other != person
    ]

    # Highest combined score first.
    scores.sort(reverse=True)
    return scores[:n]
def trustPropagate(td):
    """Propagate trust one hop through directly trusted users, in place.

    td maps each truster to a dict of {trustee: trust value}; a value of 1
    is a direct trust statement.  For every chain p1 -> p2 -> p3 of direct
    statements, p1 is given a reduced trust of 0.5 in p3, unless p1 already
    holds some trust value for p3 or p3 is p1 itself.

    Only the direct statements present when the function is called are
    followed, so the result does not depend on iteration order and the
    derived 0.5 entries are not propagated any further.  Returns None.
    """
    reduced_trust = 0.5

    # Snapshot the direct edges before mutating td.  Walking real edges only
    # (rather than every user triple) also copes with the sparse matrix,
    # where most td[p1][p2] entries simply do not exist.
    direct = dict(
        (truster, [trustee for trustee, value in row.items() if value == 1])
        for truster, row in td.items()
    )

    for p1, friends in direct.items():
        row = td[p1]
        for p2 in friends:
            # p2 may never have trusted anyone and so have no row of its own.
            for p3 in direct.get(p2, ()):
                # Never downgrade an existing statement, and skip self-trust.
                if p3 != p1 and p3 not in row:
                    row[p3] = reduced_trust
def getRecommendations(prefs, td, person, similarity=sim_pearson):
    """Get recommendations for person from their best-matching neighbours.

    The neighbours come from topMatches(prefs, td, person).  Every item the
    person has not rated (or has rated 0) is predicted as

        person's average + sum(sim * (rating - neighbour's average))
                           / sum(|sim|)

    over the neighbours who rated that item.

    prefs      -- dict of {user: {item: rating}}
    td         -- dict of {truster: {trustee: trust value}}
    person     -- the user to recommend for
    similarity -- function (prefs, a, b) -> score, forwarded to topMatches

    Returns up to 10 (predicted rating, item) tuples, best first.  Progress
    information is printed to stdout.
    """
    own_ratings = prefs[person]

    # Person's average rating.  float() keeps true division for integer
    # ratings; an empty profile falls back to 0.0 instead of dividing by 0.
    if own_ratings:
        personEverage = float(sum(own_ratings.values())) / len(own_ratings)
    else:
        personEverage = 0.0
    print('personEverage= %s' % personEverage)

    totals = {}
    simSums = {}

    # Forward the caller's similarity function; otherwise the parameter
    # would silently have no effect.
    for sim, other in topMatches(prefs, td, person, similarity=similarity):
        print('  %s   %s' % (sim, other))

        other_ratings = prefs[other]
        if not other_ratings:
            # A neighbour with no ratings contributes nothing.
            continue

        # Neighbour's average rating.
        otherEverage = float(sum(other_ratings.values())) / len(other_ratings)
        print(otherEverage)

        for item, rating in other_ratings.items():
            # Only score items the person hasn't rated yet.
            if own_ratings.get(item, 0) != 0:
                continue
            # Similarity * deviation from the neighbour's own average.
            totals[item] = totals.get(item, 0) + (rating - otherEverage) * sim
            # Normalise by |sim|: signed weights can cancel to zero or flip
            # the sign of the prediction.
            simSums[item] = simSums.get(item, 0) + abs(sim)

    # Create the normalized list, skipping items whose weights are all zero.
    rankings = [
        (personEverage + total / simSums[item], item)
        for item, total in totals.items()
        if simSums[item] != 0
    ]

    # Return the sorted list, best prediction first.
    rankings.sort(reverse=True)
    return rankings[:10]
def loadRatings(path='C:/data'):
    """Load the user/item ratings from <path>/ratings_data.dat.

    Each non-blank line is expected to hold at least three
    whitespace-separated columns: user id, item id and rating.  Lines with
    fewer than three columns are skipped.

    Returns a dict of {user id: {item id: rating as float}}; the ids are
    kept as strings.  Raises IOError/OSError if the file cannot be opened
    and ValueError if a rating is not a number.
    """
    prefs = {}
    # "with" closes the file even if a line fails to parse.
    with open(path + '/ratings_data.dat') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line.
                continue
            user, movieid, rating = fields[:3]
            # Parse the whole rating token, not just its first character,
            # so values such as "4.5" or "10" are read correctly.
            prefs.setdefault(user, {})[movieid] = float(rating)
    return prefs
def loadTrustData(path='C:/data'):
    """Load the trust statements from <path>/trust_data.dat.

    Each non-blank line is expected to hold three whitespace-separated
    columns: truster id, trustee id and trust value.  Leading whitespace on
    a line is ignored, so lines with or without a leading space are both
    accepted.  Lines with fewer than three columns are skipped.

    Returns a dict of {truster id: {trustee id: trust value as float}}; the
    ids are kept as strings.  Raises IOError/OSError if the file cannot be
    opened and ValueError if a trust value is not a number.
    """
    td = {}
    # "with" closes the file even if a line fails to parse.
    with open(path + '/trust_data.dat') as handle:
        for line in handle:
            # split() with no argument drops the empty leading field that
            # split(' ') produces for lines starting with a space.
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line.
                continue
            user1, user2, rating = fields[:3]
            # Parse the whole value token, not just its first character.
            td.setdefault(user1, {})[user2] = float(rating)
    return td
def test(prefs, user='1'):
    """Print every item rated by user, one per line, then the item count.

    prefs -- dict of {user: {item: rating}}
    user  -- key of the user to inspect; defaults to '1'

    Raises KeyError if user is not present in prefs.
    """
    ratings = prefs[user]
    for item in ratings:
        print(item)
    print(len(ratings))