Document Distance
-
Document = sequence of words
-Ignore punctuation & formatting
-
Word = sequence of alphanumeric characters
-
How to define "distance"?
-
Idea: focus on shared words
-
Word frequencies:
- D(w) = # occurrences of word w in document D
- Vector Space model
In [231]:
# Initial version of document distance
# This program computes the "distance" between two text files
# as the angle between their word frequency vectors (in radians).
# For each input file, a word-frequency vector is computed as follows:
# (1) the specified file is read in
# (2) it is converted into a list of alphanumeric "words"
# Here a "word" is a sequence of consecutive alphanumeric
# characters. Non-alphanumeric characters are treated as blanks.
# Case is not significant.
# (3) for each word, its frequency of occurrence is determined
# (4) the word/frequency lists are sorted into order alphabetically
# The "distance" between two vectors is the angle between them.
# If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i)
# and y = (y1, y2, ..., yn) is the second vector,
# then the angle between them is defined as:
# d(x, y) = arccos(inner_product(x, y) / (norm(x)*norm(y)))
# where:
# inner_product(x, y) = x1*y1 + x2*y2 + ... + xn*yn
# norm(x) = sqrt(inner_product(x, x))
In [232]:
import math
import sys
In [233]:
#################################
# Operation 1: read a text file##
#################################
def read_file(filename):
'''
Read the text file with the given filename;
return a list of the lines of text in the file.
'''
try:
f = open(filename, 'r')
return f.readlines()
except IOError:
print "Error opening or reading input file: ", filename
sys.exit()
In [234]:
#################################################
# Operation 2: split the text lines into words ##
#################################################
def get_words_from_line_list(L):
    '''
    Parse the given list L of text lines into words.
    Return list of all words found.
    '''
    all_words = []
    for text_line in L:
        # "+" allocates a fresh list on every pass, costing
        # O(len(all_words)) per line; kept on purpose -- version 3
        # below replaces it with extend() to demonstrate the speedup.
        all_words = all_words + get_words_from_string(text_line)
    return all_words
def get_words_from_string(line):
    '''
    Return a list of the words in the given input string,
    converting each word to lower-case.

    Input: line (a string)
    Output: a list of strings
            (each string is a sequence of alphanumeric characters)
    '''
    words = []    # completed words, in order of appearance
    current = []  # characters of the word being scanned
    for ch in line:
        if ch.isalnum():
            current.append(ch)
            continue
        # non-alphanumeric character terminates the current word (if any)
        if current:
            words.append("".join(current).lower())
            current = []
    # flush a word that runs to the end of the line
    if current:
        words.append("".join(current).lower())
    return words
In [235]:
# test get_words_from_string
s = "This is a test String!"
word_list = get_words_from_string(s)
# Python 2 print statement; expected: ['this', 'is', 'a', 'test', 'string']
print word_list
In [236]:
# test get_words_from_line_list
L_test = []
L_test.append("Parse the given list L of text lines into words.")
L_test.append("Return list of all words found.")
L_test.append("get_words_from_line_list")
word_list = get_words_from_line_list(L_test)
# bare expression: echoed by the notebook only, not printed as a script
word_list
Out[236]:
In [237]:
##############################################
# Operation 3: count frequency of each word ##
##############################################
def count_frequency(word_list):
    '''
    Return a list giving pairs of form: (word, frequency)
    '''
    pairs = []  # [word, count] in first-occurrence order
    for w in word_list:
        # linear scan of the pairs seen so far (O(distinct words) per word;
        # the dictionary version later removes this cost)
        matched = False
        for pair in pairs:
            if pair[0] == w:
                pair[1] += 1
                matched = True
                break
        if not matched:
            pairs.append([w, 1])
    return pairs
In [238]:
# test count_frequency (word_list comes from the previous test cell)
count_frequency(word_list)
Out[238]:
In [239]:
###################################################
# Operation 4: sort words into alphabetic order ##
###################################################
def insertion_sort(A):
    '''
    Sort list A into order, in place; also return A for convenience.
    '''
    # index 0 is trivially sorted, so start inserting from index 1
    for idx in range(1, len(A)):
        current = A[idx]
        pos = idx
        # shift larger elements one slot right to open a gap for current
        while pos > 0 and A[pos - 1] > current:
            A[pos] = A[pos - 1]
            pos -= 1
        A[pos] = current
    return A
In [240]:
#########################################################
# Operation 5: compute word frequencies for input file ##
#########################################################
def word_frequencies_for_file(filename):
    '''
    Return alphabetically sorted list of (word, frequency) pairs
    for the given file
    '''
    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
    # sorts the [word, count] pairs in place; list comparison looks at the
    # word first, so the result is alphabetical by word
    insertion_sort(freq_mapping)
    # Python 2 print statements; trailing commas suppress the newline
    print "File ", filename, ": "
    print len(line_list), "lines, ",
    print len(word_list), "words, ",
    print len(freq_mapping), "distinct words"
    return freq_mapping
In [241]:
# test word_frequencies_for_file
# NOTE(review): hard-coded absolute path to the lecture data files
file_name = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
freq_mapping = word_frequencies_for_file(file_name)
In [242]:
def inner_product(L1, L2):
    '''
    Inner product between two vectors, where vectors
    are represented as alphabetically sorted (word, freq) pairs.

    Example: inner_product([["and",3],["of",2],["the",5]],
                           [["and",4],["in",1],["of",1],["this",2]]) = 14.0
    '''
    # accumulator renamed: the original shadowed the builtin sum()
    total = 0.0
    i = 0
    j = 0
    # merge-style walk: L1[i:] and L2[j:] are yet to be processed
    while i < len(L1) and j < len(L2):
        if L1[i][0] == L2[j][0]:
            # both vectors have this word; it contributes to the product
            total += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:
            # word L1[i][0] is in L1 but not L2
            i += 1
        else:
            # word L2[j][0] is in L2 but not L1
            j += 1
    return total
In [243]:
def vector_angle(L1, L2):
    '''
    The inputs are two lists of (word, freq) pairs, each sorted
    alphabetically by word.
    Return the angle between these two vectors, in radians.

    Raises ZeroDivisionError if either vector is empty/all-zero,
    since the cosine is undefined for a zero vector.
    '''
    numerator = inner_product(L1, L2)
    denominator = math.sqrt(inner_product(L1, L1) * inner_product(L2, L2))
    # inner_product accumulates into a float, so the division is already
    # floating-point (the old commented-out float() cast was redundant)
    return math.acos(numerator / denominator)
In [244]:
# document distance version 1 test
def test_docdist_1():
    # two Project Gutenberg texts (hard-coded local paths)
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file(filename_1)
    sorted_word_list_2 = word_frequencies_for_file(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
test_docdist_1()
In [245]:
# document distance version 2
# add profiling
# NOTE: "profile" is the pure-Python profiler; cProfile is the faster drop-in
import profile
profile.run("test_docdist_1()")
In [246]:
# document distance version 3
# replace + with extend
# for list operation A + B costs O(|A| + |B|)
# for list operation A.extend(B) costs O(|B|)
In [247]:
def get_words_from_line_list_3(L):
    '''
    Parse the given list L of text lines into words.
    Return list of all words found.
    '''
    result = []
    for current_line in L:
        # extend() appends in place: O(words in this line) per line,
        # instead of rebuilding the accumulator as "+" concatenation does
        result.extend(get_words_from_string(current_line))
    return result
In [248]:
def word_frequencies_for_file_3(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs
    for the given file.
    Version 3: same pipeline as version 1 but splits words with
    the extend()-based get_words_from_line_list_3.
    """
    line_list = read_file(filename)
    word_list = get_words_from_line_list_3(line_list)
    freq_mapping = count_frequency(word_list)
    # sorts the [word, count] pairs in place, alphabetically by word
    insertion_sort(freq_mapping)
    # Python 2 prints; trailing commas keep the output on one line
    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [249]:
# document distance version 3 test
def test_docdist_3():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_3(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_3(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
test_docdist_3()
In [250]:
# profile version 3 to compare against version 1
profile.run("test_docdist_3()")
In [251]:
# document distance version 4
# count frequencies using dictionary
def count_frequency_4(word_list):
    '''
    Return a list giving pairs of form: (word, frequency)
    '''
    counts = {}
    for w in word_list:
        # get() supplies 0 for first occurrence, replacing the if/else
        counts[w] = counts.get(w, 0) + 1
    return counts.items()
In [252]:
def word_frequencies_for_file_4(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs
    for the given file.
    Version 4: counts frequencies with a dictionary (count_frequency_4).
    """
    line_list = read_file(filename)
    word_list = get_words_from_line_list_3(line_list)
    freq_mapping = count_frequency_4(word_list)
    # sorts the (word, count) pairs in place, alphabetically by word
    insertion_sort(freq_mapping)
    # Python 2 prints; trailing commas keep the output on one line
    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [253]:
# document distance version 4 test
def test_docdist_4():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_4(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_4(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
test_docdist_4()
In [254]:
# profile version 4 (dictionary counting) against earlier versions
profile.run("test_docdist_4()")
In [255]:
# document distance version 5
# split words with string.translate
In [256]:
import string
In [257]:
# global variables needed for fast parsing
# translation table maps upper case to lower case and punctuation to spaces
# NOTE: Python 2 API -- string.maketrans and string.uppercase/lowercase were
# removed in Python 3 (use str.maketrans and string.ascii_* there)
translation_table = string.maketrans(string.punctuation + string.uppercase,
                                     " " * len(string.punctuation) + string.lowercase)
In [258]:
def get_words_from_line_list_5(L):
    '''
    Parse the given list L of text lines into words.
    Return list of all words found.
    '''
    collected = []
    for raw_line in L:
        # extend() appends in place -- O(words in line), not O(total so far)
        collected.extend(get_words_from_string_5(raw_line))
    return collected
def get_words_from_string_5(line):
    '''
    Return a list of words in the given input string,
    converting each word to lower-case.

    Input: line (a string)
    Output: a list of strings
            (each string is a sequence of alphanumeric characters)
    '''
    # One C-level pass: the module-level translation_table lower-cases
    # letters and turns punctuation into spaces; split() then separates
    # the words on whitespace.
    return line.translate(translation_table).split()
In [259]:
def word_frequencies_for_file_5(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs
    for the given file.
    Version 5: splits words via the translate-table parser
    (get_words_from_line_list_5).
    """
    line_list = read_file(filename)
    word_list = get_words_from_line_list_5(line_list)
    freq_mapping = count_frequency_4(word_list)
    # sorts the (word, count) pairs in place, alphabetically by word
    insertion_sort(freq_mapping)
    # Python 2 prints; trailing commas keep the output on one line
    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [260]:
# document distance version 5 test
def test_docdist_5():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_5(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_5(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
test_docdist_5()
In [284]:
# profile version 5 (translate-based word splitting)
profile.run("test_docdist_5()")
In [262]:
# document distance version 6
# change insertionsort to merge sort
In [263]:
def merge_sort(A):
    """
    Sort list A into order, and return the result
    (A itself for n <= 1, a new list otherwise).

    Fix: the base case is now n <= 1. The original tested n == 1 only,
    so merge_sort([]) called merge_sort([]) again and recursed forever.
    """
    n = len(A)
    if n <= 1:
        return A
    mid = n // 2  # floor division
    L = merge_sort(A[:mid])
    R = merge_sort(A[mid:])
    return merge(L, R)


def merge(L, R):
    """
    Given two sorted sequences L and R, return their sorted merge.
    On ties the element from R is taken first (L[i] < R[j] is strict).
    """
    i = 0
    j = 0
    answer = []
    while i < len(L) and j < len(R):
        if L[i] < R[j]:
            answer.append(L[i])
            i += 1
        else:
            answer.append(R[j])
            j += 1
    # at most one side has leftovers; extending with an empty slice is a no-op
    answer.extend(L[i:])
    answer.extend(R[j:])
    return answer
In [264]:
# test merge_sort
merge_result = merge_sort([1, 81, 65, 68, 34, 21, 10, 7, 9])
# Python 2 print; expected: [1, 7, 9, 10, 21, 34, 65, 68, 81]
print merge_result
In [265]:
# same input through insertion_sort for comparison (result echoed by notebook)
insertion_sort([1, 81, 65, 68, 34, 21, 10, 7, 9])
Out[265]:
In [266]:
def count_frequency_6(word_list):
    """
    Return a list giving pairs of form: (word,frequency)
    """
    # same dictionary-based counting as count_frequency_4
    tally = {}
    for term in word_list:
        if term not in tally:
            tally[term] = 0
        tally[term] += 1
    return tally.items()
In [267]:
def word_frequencies_for_file_6(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs
    for the given file.
    Version 6: dictionary counting + merge sort instead of insertion sort.
    NOTE(review): this calls get_words_from_line_list (the slow version-1
    "+"-concatenation splitter), while the commented lines below show the
    fast version-5 pipeline swapped out -- confirm whether that is intended.
    """
    #line_list = read_file(filename)
    #word_list = get_words_from_line_list_5(line_list)
    #freq_mapping = count_frequency_4(word_list)
    #merge_sort(freq_mapping)
    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency_6(word_list)
    # merge_sort returns a NEW sorted list, so rebinding is required here
    freq_mapping = merge_sort(freq_mapping)
    # Python 2 prints; trailing commas keep the output on one line
    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [268]:
# document distance version 6 test
def test_docdist_6():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_6(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_6(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
test_docdist_6()
In [269]:
# profile version 6 (merge sort)
profile.run("test_docdist_6()")
In [270]:
# document distance version 7
# no sorting,dot product with dictionary
In [271]:
def count_frequency_7(word_list):
    '''
    Return a dictionary mapping words to frequency.
    '''
    freq = {}
    for token in word_list:
        # get() supplies 0 the first time a token is seen
        freq[token] = freq.get(token, 0) + 1
    return freq
In [272]:
def word_frequencies_for_file_7(filename):
    """
    Return a dictionary mapping word -> frequency for the given file.
    Version 7 skips sorting entirely; the dictionary feeds inner_product_7.
    (The previous docstring wrongly promised a sorted pair list.)
    """
    line_list = read_file(filename)
    word_list = get_words_from_line_list_5(line_list)
    freq_mapping = count_frequency_7(word_list)
    # Python 2 prints; trailing commas keep the output on one line
    print "File",filename,":",
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [273]:
def inner_product_7(D1, D2):
    '''
    Inner product between two vectors, where vectors are
    represented as dictionaries mapping word -> freq.
    Example: inner_product_7({"and":3,"of":2,"the":5},
                             {"and":4,"in":1,"of":1,"this":2}) = 14.0
    '''
    # accumulator renamed: the original shadowed the builtin sum()
    total = 0.0
    for key in D1:
        # only words present in BOTH documents contribute
        if key in D2:
            total += D1[key] * D2[key]
    return total
In [274]:
# test inner_product_7 -- expected value: 14.0 (3*4 + 2*1)
inner_product_7({"and":3,"of":2,"the":5},
{"and":4,"in":1,"of":1,"this":2})
Out[274]:
In [275]:
def vector_angle_7(L1, L2):
    '''
    The inputs are two dictionaries mapping word -> frequency
    (unlike vector_angle, which takes sorted (word, freq) pair lists;
    the previous docstring described the wrong input type).
    Return the angle between these two vectors, in radians.

    Raises ZeroDivisionError if either vector is empty/all-zero.
    '''
    numerator = inner_product_7(L1, L2)
    denominator = math.sqrt(inner_product_7(L1, L1) * inner_product_7(L2, L2))
    return math.acos(numerator / denominator)
In [276]:
# document distance version 7 test (header previously said "version 6")
def test_docdist_7():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_7(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_7(filename_2)
    distance = vector_angle_7(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
#test_docdist_7()
In [277]:
# profile.run still invokes the test even though the direct call is commented out
profile.run("test_docdist_7()")
In [278]:
# document distance version 8
# split words on whole document, not line by line.
In [279]:
def read_file_8(filename):
"""
Read the text file with the given filename;
return a list of the lines of text in the file.
"""
try:
f = open(filename, 'r')
return f.read()
except IOError:
print "Error opening or reading input file: ",filename
sys.exit()
In [280]:
def get_words_from_line_list_8(text):
    '''
    Parse the given text into words.
    Return list of all words found.
    '''
    # Translate the whole document in a single pass (punctuation -> spaces,
    # upper -> lower via the module-level table), then split on whitespace.
    return text.translate(translation_table).split()
In [281]:
def word_frequencies_for_file_8(filename):
    """
    Return a dictionary mapping word -> frequency for the given file.
    Version 8: reads the whole file as ONE string and splits it in one pass.
    (The previous docstring wrongly promised a sorted pair list.)
    """
    # read_file_8 returns a single string, not a list of lines
    line_list = read_file_8(filename)
    word_list = get_words_from_line_list_8(line_list)
    freq_mapping = count_frequency_7(word_list)
    print "File",filename,":",
    # NOTE(review): len(line_list) is now the CHARACTER count of the file,
    # so the "lines," label below is misleading in this version
    print len(line_list),"lines,",
    print len(word_list),"words,",
    print len(freq_mapping),"distinct words"
    return freq_mapping
In [282]:
# document distance version 8 test (header previously said "version 6")
def test_docdist_8():
    filename_1 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t1.verne.txt"
    filename_2 = "/home/will/myspace/mydev/mytest/sparktest/pyspark/6006/unit1/lec01_data/t2.bobsey.txt"
    sorted_word_list_1 = word_frequencies_for_file_8(filename_1)
    sorted_word_list_2 = word_frequencies_for_file_8(filename_2)
    distance = vector_angle_7(sorted_word_list_1,sorted_word_list_2)
    print "The distance between the documents is: %0.6f (radians)" % distance
#test_docdist_8()
In [283]:
# profile.run still invokes the test even though the direct call is commented out
profile.run("test_docdist_8()")

本文介绍了一种计算两份文档间距离的方法,通过将其转化为词频向量间的角度来衡量相似度。具体步骤包括读取文件、提取单词、计算频率及求向量夹角。
2259

被折叠的 条评论
为什么被折叠?



