# csv load
import nltk
import csv
import json
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import blankline_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer # import Porter stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import urllib.request as urllib
from bs4 import BeautifulSoup
from nltk.metrics import edit_distance
# Fetch NLTK data when this script starts.
# The active call below passes no arguments, so no specific resource is named;
# the commented-out alternative would request only the 'punkt' resource.
# NOTE(review): the no-argument form presumably opens NLTK's interactive
# downloader -- confirm that is acceptable for unattended runs.
# nltk.download('punkt')
nltk.download()
# csv load
# Read every record of 600000.csv into memory as `rows`: a list in which each
# element is the list of string fields csv.reader produced for one record.
# newline='' is what the csv module asks for when opening its input, so that
# line breaks embedded inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm the file's actual encoding and pass it explicitly if it differs.
with open('600000.csv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    # To inspect a single column instead, loop over `reader` and index each
    # line, e.g. line[1] (assuming the second field is the raw string).
    rows = list(reader)
# `rows` is fully materialized, so it remains usable after the file is closed.
print(rows)
# json load
# Parse example.json into Python objects and print the result.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes, even if json.load raises.
with open('example.json') as jsonfile:
    data = json.load(jsonfile)
print(data)
# 文本清洗 如章节一
# 句子拆分器
# from
# NLTK文本整理和清洗示例代码
# 最新推荐文章于 2024-11-01 18:31:52 发布