- 应用:对北京大学 CCL 语料库(http://ccl.pku.edu.cn:8080/ccl_corpus/index.jsp?dir=xiandai)进行爬取
#coding:UTF-8
import keys
import urllib
import codecs
import sys
# Python 2 only: `sys.setdefaultencoding` is removed from the `sys` namespace
# at interpreter start-up, so the module is reloaded to get it back.  After
# this call, implicit str <-> unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# Characters to look up; each one is sent to the corpus as its own query.
chines = u'''鲌靽搒栟鲾沘瀌嶓袯啴宬瓻犨堲荙疍鱽蹢汈厾墦剕鳡唝鳤鬹郈澴塃篊鍠鱾穄庤镃檵銍袗泜峧溠鸼浕郚铻滃鲊憖扊簉涢螠鹝勩扅倻垟婞饻栒塮泃辒鳈鳂绹蹜溦葖鄌黇峂菼腒潵湨鸤鼫蒟鲏鲪庼萩狉焌鹐梾鳑蓂辌礳醾鹲昽剅溇杧'''

for word in chines:
    # Percent-encode the UTF-8 bytes of the character once; the same value is
    # used for both the `q` and `LastQuery` parameters.
    quoted = urllib.quote(word.encode('utf-8'))

    # Plain-text download request for up to 10000 hits with 100 characters of
    # context on each side.
    # NOTE(review): `download_serarch_result` is spelled exactly as in the
    # original request; presumably the server expects it -- confirm before
    # "correcting" it.
    url = (
        'http://ccl.pku.edu.cn:8080/ccl_corpus/search'
        '?download_serarch_result=%E4%B8%8B%E8%BD%BD'
        '&UserMaxHits=50&dir=xiandai'
        '&q=' + quoted +
        '&LastQuery=' + quoted +
        '&num=10000&index=FullIndex&outputFormat=TEXT&encoding=UTF-8'
        '&orderStyle=score&maxLeftLength=100&maxRightLength=100&scopestr='
    )
    # Alternative request (HTML output, 50 hits starting at 0), kept for reference:
    #url = '''http://ccl.pku.edu.cn:8080/ccl_corpus/search?dir=xiandai&q='''+ urllib.quote(word.encode('utf-8')) +'''&LastQuery='''+ urllib.quote(word.encode('utf-8')) +'''&start=0&num=50&index=FullIndex&outputFormat=HTML&orderStyle=score&encoding=UTF-8&neighborSortLength=0&maxLeftLength=30&maxRightLength=100&isForReading=no&scopestr='''

    # Close the connection even if reading fails, so that one socket is not
    # left open per queried character.
    response = urllib.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()

    # Decode the body as UTF-8 and drop every square bracket from the text.
    data = data.decode('utf-8').replace('[', '').replace(']', '')