#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2015-1-19
@author: beyondzhou
@name: explore_google_tfidf.py
'''
# Querying Google+ data with TF-IDF
import json
import nltk
# Rank saved Google+ activities against QUERY_TERMS using TF-IDF and print
# the matches (title, link, score) from highest to lowest score.

# Load in human language data from wherever you've saved it
DATA = r'E:\eclipse\Google\dFile\107033731246200681024.json'

# Use a context manager so the file handle is closed as soon as it is parsed.
with open(DATA) as data_file:
    data = json.load(data_file)

# Provide your own query terms here
QUERY_TERMS = ['best']

# Tokenize each activity's content (lower-cased, whitespace-split) and keep
# the originating activity record alongside its tokens.  The two lists are
# index-aligned, so a score is always reported with the title/url of the
# activity it was computed from, even when some activities are skipped.
# Activities whose content yields no tokens (empty or whitespace-only) are
# skipped: an empty token list would make the term-frequency computation
# divide by zero.
source_activities = []
activities = []
for entry in data:
    tokens = entry['object']['content'].lower().split()
    if tokens:
        source_activities.append(entry)
        activities.append(tokens)

# TextCollection provides tf, idf, and tf_idf abstractions so
# that we don't have to maintain/compute them ourselves
tc = nltk.TextCollection(activities)

# Normalize the query terms once rather than on every loop iteration.
query_terms = [term.lower() for term in QUERY_TERMS]

relevant_activities = []
for entry, tokens in zip(source_activities, activities):
    # An activity's score is the sum of the TF-IDF weights of all query terms.
    score = sum(tc.tf_idf(term, tokens) for term in query_terms)
    if score > 0:
        relevant_activities.append({'score': score,
                                    'title': entry['title'],
                                    'url': entry['url']})

# Sort by score and display results
relevant_activities.sort(key=lambda p: p['score'], reverse=True)

# Single-argument parenthesized prints produce the same output under the
# Python 2 print statement and the Python 3 print function.
for result in relevant_activities:
    print(result['title'])
    print('\tLink: %s' % (result['url'], ))
    print('\tScore: %s' % (result['score'], ))
    print('')
# Sample output:
#
# Now on Medium--the Best of O'Reilly Radar: http://bit.ly/133U4wb Our latest thinking on the big ideas...
#     Link: https://plus.google.com/107033731246200681024/posts/LzTHAvJsDZ9
#     Score: 0.142631571496
# The best definition of Freudian psychoanalysis I've ever seen, from poet W.H. Auden:
# "...he merely ...
#     Link: https://plus.google.com/107033731246200681024/posts/ZE3cDmqLXnN
#     Score: 0.0413424844915
# Can We Use Data to Make Better Regulations?
# Evgeny Morozov either misunderstands or misrepresents the...
#     Link: https://plus.google.com/107033731246200681024/posts/gboAUahQwuZ
#     Score: 0.0156165954192