Random Text Generator:
Generates "meaningful" random English text based on some input sample files.
Uses a Markov chain algorithm as the basis of this generator.
Simple Version:
#!/usr/bin/env python
'''
This module offers functions which parse an English text file
and generate a statistical model of the text.
'''
# simple parse function that parses a file and returns a dictionary
# containing the statistical model
def parse_simple(filepath, prefixnum=2):
    '''
    Parse the text file at filepath into a Markov-chain model.

    The text is split on whitespace and a window holding the prefixnum
    most recent words slides over it.  The returned dict has the form
    {(p1, p2, .., pn): [s1, s2, ..., sm], ...}: each key is a tuple of
    prefixnum consecutive words and each value lists, in order of first
    appearance and without duplicates, the words seen right after it.

    The window starts out filled with None, so the all-None key leads to
    the first word of the text, and a None suffix is recorded for the
    final window to mark the end of the text.

    filepath  -- path of the text file to read
    prefixnum -- number of words in each prefix key (default 2)

    Returns the model dict, or None when the file cannot be read.
    '''
    dict_stat_model = {}
    prefix = (None,) * prefixnum
    try:
        with open(filepath) as f:
            # parse file line by line, word by word
            for line in f:
                for word in line.split():
                    suffixes = dict_stat_model.setdefault(prefix, [])
                    if word not in suffixes:
                        suffixes.append(word)
                    # Append first, then drop the oldest word, so the
                    # window keeps exactly prefixnum entries.
                    prefix = (prefix + (word,))[1:]
    except IOError:
        return None
    # Mark the end of the text for the last window.
    suffixes = dict_stat_model.setdefault(prefix, [])
    if None not in suffixes:
        suffixes.append(None)
    return dict_stat_model
# helper functions
def dict_add(d, w1, w2, w3):
    '''
    Record w3 as a suffix of the prefix (w1, w2) in the model dict d.

    The suffix list for the prefix is created on first use, and a
    suffix that is already listed is not added a second time.
    '''
    followers = d.setdefault((w1, w2), [])
    if w3 in followers:
        return
    followers.append(w3)
def print_dict(d):
    '''
    Print each entry of d as "<key> <value>" on its own line, then a
    single blank line.
    '''
    for prefix, suffixes in d.items():
        print('%s %s' % (prefix, suffixes))
    print('')
# self-contained test functions
# return the number of failed test cases
def test_parse_simple():
    '''
    Run parse_simple against the sample files and an invalid path.

    For the empty, short and long sample files a model is expected; the
    label is printed and, when parsing succeeded, the model is printed
    too.  For the invalid path parse_simple is expected to return None.

    Returns the number of failed test cases.
    '''
    failed_test_cases = 0
    valid_inputs = (
        ('[empty file]', 'test/empty.txt'),
        ('[short file]', 'test/short.txt'),
        ('[long file]', 'test/long.txt'),
    )
    for label, path in valid_inputs:
        dict_stat = parse_simple(path)
        print(label)
        if dict_stat is None:
            # Count the failure and move on: there is no model to print,
            # and print_dict cannot iterate over None.
            failed_test_cases += 1
            continue
        print_dict(dict_stat)
    # test with invalid input file (not exist)
    if parse_simple("INVALID_FILE_PATH") is not None:
        failed_test_cases += 1
    return failed_test_cases
# run tests
if __name__ == '__main__':
    # Run the self-contained tests and report how many cases failed.
    print('========== parse.py: self-contained tests start =====')
    failed_test_cases = test_parse_simple()
    print('test_parse_simple: %s test cases failed' % failed_test_cases)
    print('========== parse.py: self-contained tests end =======')
#!/usr/bin/env python
'''
This program generates random English text based on the input file.
Usage: gen.py <inputfile>
'''
import sys
import parse
from random import choice
def gen_rand_text(filepath):
    '''
    Print random text generated from the model of the file at filepath.

    The model comes from parse.parse_simple.  Starting at the
    (None, None) prefix, a suffix is chosen at random for the current
    two-word prefix until the None end marker is drawn.  The chosen
    words are printed on one line separated by single spaces, followed
    by a blank line.

    Returns 0 on success, or 1 (after printing an error message) when
    the file could not be parsed.
    '''
    dict_stat = parse.parse_simple(filepath)
    if dict_stat is None:
        # "%s" is required here: a bare "% f" is read as a float
        # conversion and raises TypeError for a string path.
        print('parse %s failed, maybe invalid file path' % (filepath,))
        return 1
    words = []
    w1 = w2 = None
    while True:
        w3 = choice(dict_stat[(w1, w2)])
        if w3 is None:
            break
        # Every chosen word is part of the output, including the only
        # word of a one-word input.
        words.append(w3)
        w1, w2 = w2, w3
    if words:
        print(' '.join(words))
    print('')
    return 0
def usage():
    '''
    Print the one-line command-line usage message.
    '''
    message = "Usage: gen.py <inputfile>"
    print(message)
if __name__ == "__main__":
    # check command arguments
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    # Use gen_rand_text's return value (0 on success, 1 when parsing
    # failed) as the exit status so a failure is visible to the caller.
    sys.exit(gen_rand_text(sys.argv[1]))
More complete version:
#!/usr/bin/env python
'''
This module offers a class that represents the text model of
some input text files -- TextModel.
'''
from random import choice
import sys
class TextModel:
    '''
    Markov-chain model built from one or more text files.

    state_dict  -- maps each prefix (a tuple of prefix_len words) to the
                   list of suffixes seen after it; repeated suffixes are
                   kept, so a random choice from the list is weighted by
                   how often each suffix occurred
    prefix_list -- every distinct prefix, in order of first appearance
    prefix_len  -- number of words in a prefix
    '''

    def __init__(self, prefix_len=2):
        '''
        Create an empty model whose prefixes hold prefix_len words.
        '''
        self.state_dict = {}
        self.prefix_list = []
        self.prefix_len = prefix_len
        # prefix_len should be within the range of [1,5],
        # otherwise, it's meaningless
        assert self.prefix_len >= 1
        assert self.prefix_len <= 5

    def __str__(self):
        return str(self.state_dict)

    def add(self, prefix, suffix):
        '''
        add prefix,suffix to state_dict

        A prefix seen for the first time is also appended to
        prefix_list.
        '''
        if prefix not in self.state_dict:
            self.state_dict[prefix] = [suffix]
            self.prefix_list.append(prefix)
        else:
            self.state_dict[prefix].append(suffix)

    def getSuffixes(self, prefix):
        '''
        Return the suffix list of prefix; raises KeyError when the
        prefix is not in the model.
        '''
        return self.state_dict[prefix]

    def getRandSuffix(self, prefix):
        '''
        Return one suffix of prefix, chosen at random.
        '''
        suffixes = self.getSuffixes(prefix)
        return choice(suffixes)

    def getRandPrefix(self):
        '''
        Return one known prefix, chosen at random.
        '''
        return choice(self.prefix_list)

    def parseFile(self, f):
        '''
        Parse f and add its words to the statistical model.

        f is iterated line by line, so it may be an open file or any
        other iterable of text lines (for example a StringIO object).
        The prefix window starts out filled with None for each parsed
        file, and a None suffix is added for the final window to mark
        the end of the text.
        '''
        window = (None,) * self.prefix_len
        for line in f:
            for w in line.split():
                # add to dict
                self.add(window, w)
                # shift by one
                window = window[1:] + (w,)
        self.add(window, None)

    def parseFiles(self, fList):
        '''
        Parse every file object in fList with parseFile, in order.
        '''
        for f in fList:
            self.parseFile(f)
# self-contained tests for this module
def testFileParsing(tm, argv):
if len(argv) == 0:
tm.parseFile(sys.stdin)
sys.stdin.close()
else:
fList=[]
for arg in argv:
fList.append(open(arg))
tm.parseFiles(fList)
for f in fList:
f.close()
def testRandPrefixAndSuffix(tm):
for i in range(0, 10):
prefix=tm.getRandPrefix()
suffix=tm.getRandSuffix(prefix)
print prefix, suffix
if __name__ == "__main__":
    # Build a model from the command-line files (or stdin when none are
    # given), print it, then print some random prefix/suffix pairs.
    tm = TextModel()
    testFileParsing(tm, sys.argv[1:])
    print(tm)
    testRandPrefixAndSuffix(tm)
#!/usr/bin/env python
'''
This module provides a class -- TextGenerator.
This class parses a given text model and generates
output according to the model.
'''
import sys
from TextModel import TextModel
import re
# class TextGenerator
class TextGenerator:
    '''
    Generate random text from a text model according to some settings.

    tm       -- the text model; it must provide getRandPrefix() and
                getRandSuffix(prefix)
    out_file -- object whose write() method receives the output
    slen     -- number of sentences to generate
    '''

    # Compiled once for the class instead of on every call.
    _SENTENCE_START_RE = re.compile(r"^[A-Z].*")
    _SENTENCE_END_RE = re.compile(r'.*[.?!]{1}["]{0,1}$')

    def __init__(self, text_model, out_file=sys.stdout, slen=1):
        self.tm = text_model
        self.out_file = out_file
        self.slen = slen

    def setSentenceNum(self, n):
        '''
        Set the number of sentences genRandText outputs.
        '''
        self.slen = n

    def sentence_start(self, word):
        '''
        whether word is a start of a sentence
        (it begins with an ASCII capital letter)
        '''
        return bool(self._SENTENCE_START_RE.match(word))

    def sentence_end(self, word):
        '''
        whether word is an end of a sentence
        (it ends with '.', '?' or '!', optionally followed by '"')
        '''
        return bool(self._SENTENCE_END_RE.match(word))

    def genRandText(self):
        '''
        Generate random text and write it to the output file.

        start: a sentence starts at the first prefix[0] that looks like
               a sentence start; prefixes whose first word is None are
               dropped and a new random prefix is drawn
        end:   when slen (num of sentences) are output

        Every word is written followed by a space, and a newline is
        written after a word that ends a sentence.
        '''
        num = 0
        while num < self.slen:
            prefix = self.tm.getRandPrefix()
            # output one sentence in each inner loop
            in_sentence = False
            while True:
                head = prefix[0]
                # if prefix[0] is None, break out, and choose a new prefix
                # otherwise, the None-headed prefix would have an impact on
                # the statistical model of the input text file
                if head is None:
                    break
                # Words are skipped until one starts a sentence.
                if not in_sentence and self.sentence_start(head):
                    in_sentence = True
                if in_sentence:
                    self.out_file.write(head)
                    self.out_file.write(" ")
                    if self.sentence_end(head):
                        in_sentence = False
                        self.out_file.write("\n")
                        num += 1
                        break
                # get a new prefix
                try:
                    suffix = self.tm.getRandSuffix(prefix)
                except (KeyError, IndexError):
                    # The model has no suffix for this prefix: restart
                    # from a random one.
                    prefix = self.tm.getRandPrefix()
                else:
                    prefix = tuple(prefix)[1:] + (suffix,)
                # check whether the sentence ends
                if prefix[-1] is None or self.sentence_end(prefix[-1]):
                    break
            # output remaining words of a sentence
            if in_sentence:
                for w in prefix:
                    if w:
                        self.out_file.write(w)
                        self.out_file.write(" ")
                        if self.sentence_end(w):
                            in_sentence = False
                            self.out_file.write("\n")
                            break
                num += 1
# self-contained tests
def testTextGenerator(argv):
    '''
    Build a TextModel from the input and write five random sentences.

    With an empty argv, sys.stdin is parsed and then closed.  Otherwise
    every path in argv is opened and parsed, and all opened files are
    closed again, even when opening a later file or the parsing raises.
    The text is then generated by a TextGenerator using its default
    output file.
    '''
    tm = TextModel()
    if not argv:
        tm.parseFile(sys.stdin)
        sys.stdin.close()
    else:
        fList = []
        try:
            for arg in argv:
                fList.append(open(arg))
            tm.parseFiles(fList)
        finally:
            for f in fList:
                f.close()
    tg = TextGenerator(tm)
    tg.setSentenceNum(5)
    tg.genRandText()
# Script entry point: run the self-contained test, passing the
# command-line arguments (without the script name) as input file paths.
if __name__ == "__main__":
    testTextGenerator(sys.argv[1:])