Random Text Generator (Markov)

最新推荐文章于 2025-06-03 08:15:00 发布

转载最新推荐文章于 2025-06-03 08:15:00 发布 · 219 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://my.oschina.net/u/158589/blog/94964

文章标签：

#python

本文介绍了一个使用马尔可夫（Markov）算法实现的随机英文文本生成器，它通过解析输入文件来构建统计模型，并据此生成“有意义”的随机英文文本。程序提供了一个简单的版本，用于基于输入文件的简单解析和统计建模，以及一个更完整的版本，包括类TextModel来封装文本模型，类TextGenerator用于根据给定的文本模型生成随机文本。

2019独角兽企业重金招聘Python工程师标准>>>

Random Text Generator:

Generates "meaningful" random English text based on some input sample files.

Uses the Markov chain algorithm as the basis for this generator.

Simple Version:

#!/usr/bin/env python

'''
This module offers functions which parse an English text file
and generate a statistical model of the text.
'''

# simple parse function that parses a file and returns a dictionary
# containing the statistical model
def parse_simple(filepath, prefixnum=2):
    '''
    Parse the text file at filepath into a Markov-chain model.

    The model is a dict of the form
    {(p1, p2, ..., pn): [s1, s2, ..., sm], ...}: each key is a tuple of
    prefixnum consecutive words and its value lists, in order of first
    appearance and without duplicates, every word seen right after that
    prefix.  None pads the prefix in front of the first word of the file
    and is recorded as the suffix that follows the last word.

    filepath  -- path of the text file to read
    prefixnum -- number of words in each prefix tuple (default 2)

    Returns the model dict, or None when the file cannot be opened/read
    (IOError).
    '''
    dict_stat_model = {}
    # the prefix starts out as pure None padding of the requested length
    prefix = (None,) * prefixnum

    def record(key, suffix):
        # each distinct suffix is stored only once per prefix
        suffixes = dict_stat_model.setdefault(key, [])
        if suffix not in suffixes:
            suffixes.append(suffix)

    try:
        with open(filepath) as f:
            # parse file line by line, word by word
            for line in f:
                for word in line.split():
                    record(prefix, word)
                    # slide the window: drop the oldest word, add the new
                    prefix = (prefix + (word,))[1:]
            # mark the end of the text
            record(prefix, None)
    except IOError:
        return None

    return dict_stat_model

# helper functions
def dict_add(d, w1, w2, w3):
    '''
    Record w3 as a follower of the word pair (w1, w2) in the model d.

    d maps (w1, w2) tuples to lists of followers; the list is created on
    first use and a follower already present is not added a second time.
    '''
    followers = d.setdefault((w1, w2), [])
    if w3 in followers:
        return
    followers.append(w3)

def print_dict(d):
    '''
    Print every entry of the dict d as "<key> <value>", one entry per
    line, followed by one empty line.
    '''
    for key in d:
        # a single pre-formatted string in parentheses prints the same
        # text under both Python 2 and Python 3
        print('%s %s' % (key, d[key]))
    print('')

# self-contained test functions
# return the number of failed test cases
def test_parse_simple():
    '''
    Run parse_simple against three sample files and one invalid path.

    For each sample file the label is printed, then the parsed model; a
    None result counts as a failure and the model dump is skipped (dumping
    None would raise TypeError and abort the remaining cases).  For the
    invalid path, any non-None result counts as a failure.

    Returns the number of failed test cases.
    '''
    failed_test_cases = 0
    readable_cases = [
        ('[empty file]', 'test/empty.txt'),
        ('[short file]', 'test/short.txt'),
        ('[long file]', 'test/long.txt'),
    ]
    for label, path in readable_cases:
        dict_stat = parse_simple(path)
        print(label)
        if dict_stat is None:
            failed_test_cases += 1
            continue
        print_dict(dict_stat)
    # test with invalid input file (not exist)
    if parse_simple("INVALID_FILE_PATH") is not None:
        failed_test_cases += 1
    return failed_test_cases

# run tests
if __name__ == '__main__':
    # single-argument print(...) emits the same text on Python 2 and 3
    print('========== parse.py: self-contained tests start =====')
    failed_test_cases = test_parse_simple()
    print('test_parse_simple: %s test cases failed' % failed_test_cases)
    print('========== parse.py: self-contained tests end =======')

#!/usr/bin/env python

'''
This program generates random English text based on the input file.
Usage: gen.py <inputfile>
'''

import sys
import parse
from random import choice

def gen_rand_text(filepath):
    '''
    Print random text generated from the model of the file at filepath.

    The model comes from parse.parse_simple(filepath).  It is used as a
    dict mapping a word pair (w1, w2) to the list of candidate next
    words, starting from (None, None) and stopping when None is drawn.
    The generated words are printed on one line, separated by spaces,
    followed by an empty line.

    Returns 0 on success, or 1 (after printing a message) when
    parse_simple returns None.
    '''
    dict_stat = parse.parse_simple(filepath)
    if dict_stat is None:
        # %s, not "% f": the latter is a float conversion and raises
        # TypeError for a string path
        print('parse %s failed, maybe invalid file path' % filepath)
        return 1
    # walk the chain, collecting every word drawn before the end marker;
    # collecting them all also keeps a lone single word from being lost
    words = []
    w1 = w2 = None
    while True:
        w3 = choice(dict_stat[(w1, w2)])
        if w3 is None:
            break
        words.append(w3)
        w1, w2 = w2, w3
    if words:
        print(' '.join(words))
    print('')
    return 0
        

def usage():
    '''Print the command-line usage line for gen.py.'''
    # parenthesised single-string print works on Python 2 and Python 3
    print("Usage: gen.py <inputfile>")
    
if __name__=="__main__":
    # check command arguments
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    # pass gen_rand_text's status (0 ok, 1 parse failure) on to the shell
    # instead of always exiting with 0
    sys.exit(gen_rand_text(sys.argv[1]))

More complete version:

#!/usr/bin/env python

'''
This module offers a class that represents the text model of 
some input text files -- TextModel. 
'''

from random import choice
import sys

class TextModel:
    '''
    Statistical (Markov-chain) model of one or more texts.

    state_dict  -- maps a prefix (tuple of prefix_len words) to the list
                   of suffixes seen after it; repeated suffixes are kept,
                   so random.choice on the list follows their frequency
    prefix_list -- every distinct prefix, in order of first appearance
    prefix_len  -- number of words in a prefix
    '''
    def __init__(self, prefix_len=2):
        # prefix_len should be within the range of [1,5],
        # otherwise, it's meaningless
        assert prefix_len >= 1, "prefix_len must be at least 1"
        assert prefix_len <= 5, "prefix_len must be at most 5"
        self.state_dict = {}
        self.prefix_list = []
        self.prefix_len = prefix_len

    def __str__(self):
        return str(self.state_dict)

    def add(self, prefix, suffix):
        '''
        Add the pair prefix, suffix to state_dict; a prefix seen for the
        first time is also appended to prefix_list.
        '''
        if prefix not in self.state_dict:
            self.state_dict[prefix] = [suffix]
            self.prefix_list.append(prefix)
        else:
            self.state_dict[prefix].append(suffix)

    def getSuffixes(self, prefix):
        '''Return the suffix list of prefix; KeyError if it is unknown.'''
        return self.state_dict[prefix]

    def getRandSuffix(self, prefix):
        '''Return a random suffix of prefix; KeyError if it is unknown.'''
        return choice(self.getSuffixes(prefix))

    def getRandPrefix(self):
        '''Return a random known prefix; IndexError if the model is empty.'''
        return choice(self.prefix_list)

    def parseFile(self, f):
        '''
        Feed one text into the model.

        f is iterated line by line, so it may be an open file or any
        other iterable of text lines (no type check is made: the old
        isinstance(f, file) test only exists on Python 2 and rejected
        file-like objects).  Each text starts from an all-None prefix
        and ends with a None suffix after its last word.
        '''
        wlist = [None] * self.prefix_len
        for line in f:
            for w in line.split():
                # add to dict
                self.add(tuple(wlist), w)
                # shift by one
                wlist = wlist[1:] + [w]
        self.add(tuple(wlist), None)

    def parseFiles(self, fList):
        '''Feed every text of the iterable fList into the model.'''
        for f in fList:
            self.parseFile(f)


# self-contained tests for this module
def testFileParsing(tm, argv):
    if len(argv) == 0:
        tm.parseFile(sys.stdin)
        sys.stdin.close()
    else:
        fList=[]
        for arg in argv:
            fList.append(open(arg))
        tm.parseFiles(fList)
        for f in fList:
            f.close()

def testRandPrefixAndSuffix(tm):
    for i in range(0, 10):
        prefix=tm.getRandPrefix()
        suffix=tm.getRandSuffix(prefix)
        print prefix, suffix
    
    
if __name__ == "__main__":
    tm = TextModel()
    testFileParsing(tm, sys.argv[1:])
    # print(tm) shows str(tm) on both Python 2 and Python 3
    print(tm)
    testRandPrefixAndSuffix(tm)

#!/usr/bin/env python

'''
This module provides a class -- TextGenerator.
This class reads a given text model and generates
output according to the model.
'''

import sys
from TextModel import TextModel
import re
# class TextModel
class TextGenerator:
    '''
    Generate random text from a text model according to some settings.

    The model object has to provide getRandPrefix(), returning a tuple of
    words, and getRandSuffix(prefix), returning a word that may follow
    the prefix (None marks the end of the text).

    tm       -- the text model
    out_file -- object with a write() method that receives the output
    slen     -- number of sentences genRandText produces
    '''
    # compiled once for the class instead of on every call
    _SENTENCE_START_RE = re.compile("^[A-Z].*")
    _SENTENCE_END_RE = re.compile('.*[.?!]{1}["]{0,1}$')

    def __init__(self, text_model, out_file=sys.stdout, slen=1):
        self.tm = text_model
        self.out_file = out_file
        self.slen = slen

    def setSentenceNum(self, n):
        '''Set the number of sentences to generate.'''
        self.slen = n

    def sentence_start(self, word):
        '''
        whether word is a start of a sentence
        (it begins with an upper-case ASCII letter)
        '''
        return self._SENTENCE_START_RE.match(word) is not None

    def sentence_end(self, word):
        '''
        whether word is an end of a sentence
        (it ends with ".", "?" or "!", optionally followed by a double quote)
        '''
        return self._SENTENCE_END_RE.match(word) is not None

    def _write_word(self, word):
        '''
        Write word followed by a space; when word ends a sentence also
        write a newline.  Return True if the sentence ended.
        '''
        self.out_file.write(word)
        self.out_file.write(" ")
        if self.sentence_end(word):
            self.out_file.write("\n")
            return True
        return False

    def genRandText(self):
        '''
        generate random text
        start: when prefix[0] is the start of a sentence
        end: when slen (num of sentences) are output
        the output is written to the output file, one sentence per line
        '''
        num = 0
        while num < self.slen:
            prefix = self.tm.getRandPrefix()
            # output one sentence in each inner loop
            flag_in_sentence = False
            while True:
                # if prefix[0] is None, break out, and choose a new prefix
                # otherwise, the None-headed prefix would have an impact on
                # the statistical model of the input text file
                if prefix[0] is None:
                    break
                # a sentence only starts on a sentence-start word
                if not flag_in_sentence and self.sentence_start(prefix[0]):
                    flag_in_sentence = True
                # if in sentence, output
                if flag_in_sentence and self._write_word(prefix[0]):
                    flag_in_sentence = False
                    num += 1
                    break
                # get a new prefix
                try:
                    suffix = self.tm.getRandSuffix(prefix)
                except KeyError:
                    # unknown prefix: restart from a random one
                    prefix = self.tm.getRandPrefix()
                else:
                    prefix = tuple(prefix)[1:] + (suffix,)
                # check whether the sentence ends
                if prefix[-1] is None or self.sentence_end(prefix[-1]):
                    break
            # output remaining words of a sentence
            if flag_in_sentence:
                for w in prefix:
                    if w and self._write_word(w):
                        flag_in_sentence = False
                        break
                if flag_in_sentence:
                    # the text ran out before any end-of-sentence word:
                    # still terminate the line like every other sentence
                    self.out_file.write("\n")
                num += 1

# self-contained tests
def testTextGenerator(argv):
    '''
    Build a TextModel from the input and generate five sentences from it.

    With an empty argv the model is built from standard input, which is
    then closed.  Otherwise every entry of argv is opened as a file path
    and parsed; the opened files are closed again even when a later
    open() or the parsing itself raises.
    '''
    tm = TextModel()
    if len(argv) == 0:
        tm.parseFile(sys.stdin)
        sys.stdin.close()
    else:
        fList = []
        try:
            for arg in argv:
                fList.append(open(arg))
            tm.parseFiles(fList)
        finally:
            # close whatever was opened so far, success or not
            for f in fList:
                f.close()
    tg = TextGenerator(tm)
    tg.setSentenceNum(5)
    tg.genRandText()

if __name__ == "__main__":
    # forward only the real arguments, not the script name
    cli_args = sys.argv[1:]
    testTextGenerator(cli_args)

转载于:https://my.oschina.net/u/158589/blog/94964