1、从网络和硬盘访问文本
>>> from urllib import urlopen
>>> url = "http://www.gutenberg.org/files/2554/2554.txt">>> raw = urlopen(url).read()
>>> type(raw)
<type 'str'>
>>> len(raw)
288
>>> raw[:100]
'<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>403 Forbidden</title>\n</head>'
>>> tokens = nltk.word_tokenize(raw)
>>> text = nltk.Text(tokens)
>>> type(text)
<type 'nltk.text.Text'>
>>> text[1020:1060]
['CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in',
'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in',
'which', 'he', 'lodged', 'in', 'S', '.', 'Place', 'and', 'walked', 'slowly',
',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K', '.', 'bridge', '.']
>>> text.collocations()
Katerina Ivanovna; Pulcheria
>>> raw.find("PART I")
5303
>>> raw.rfind("End of Project Gutenberg's Crime")
1157681
>>> raw = raw[5303:1157681]
>>> url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
>>> html = urlopen(url).read()
>>> html[:60]
>>> raw = nltk.clean_html(html)
>>> tokens = nltk.word_tokenize(raw)
>>> tokens = tokens[96:399]
>>> text = nltk.Text(tokens)
>>> text.concordance('gene')
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
>>> len(llog.entries)
15
>>> import os
>>> os.listdir('.')当前路径
>>> f = open('document.txt', 'rU')
93
>>> for line in f:
... print line.strip()
Time flies like an arrow.
Fruit flies like a banana
>>> s = raw_input("Enter some text: ")
Enter some text: On an exceptionally hot evening early in July
>>> print "You typed", len(nltk.word_tokenize(s)), "words."
You typed 8 words.
2、字符串,最底层的处理