
Code Organization
weixin_43473864
Crawler Program
Original post · 2018-10-25 23:49:47

from bs4 import BeautifulSoup
import requests

def one(url):
    r = requests.get(url, allow_redirects=False)
    fin = r.text
    soup = BeautifulSoup(fin, 'html.parser')
    comments = soup.find('d...
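The snippet breaks off at soup.find('d.... Below is a minimal runnable sketch of the same crawler, assuming the truncated call was selecting a div; the URL, the class_ value, and the p tags iterated over are placeholders, not the original post's targets.

from bs4 import BeautifulSoup
import requests

def one(url):
    # allow_redirects=False stays on the exact page requested
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Hypothetical selector: the original call is truncated at soup.find('d...
    comments = soup.find('div', class_='comments')
    if comments is None:
        return []
    # One string of cleaned text per paragraph inside the matched div
    return [p.get_text(strip=True) for p in comments.find_all('p')]

if __name__ == '__main__':
    for line in one('https://example.com'):
        print(line)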
Crawler Program 2
Original post · 2018-10-25 23:54:52

from bs4 import BeautifulSoup
import requests
from lxml import etree
import re

def get_url(url):
    r = requests.get(url)
    text = r.text
    # Parse the page source with etree.HTML, then extract content with XPath
    html = etree.HTML(...
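The function stops mid-call at etree.HTML(...; here is a sketch of the pattern the comment names (parse the source with etree.HTML, extract with XPath). The XPath expression and URL are illustrative assumptions, and the unused BeautifulSoup and re imports are dropped.

import requests
from lxml import etree

def get_url(url):
    r = requests.get(url)
    text = r.text
    # Parse the raw HTML source into an element tree
    html = etree.HTML(text)
    # Hypothetical XPath: the text of every link on the page
    return html.xpath('//a/text()')

if __name__ == '__main__':
    for title in get_url('https://example.com'):
        print(title)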
A Summary of the File-Reading Code I've Written
Original post · 2018-10-31 10:47:59

Reading Excel:

# Read an Excel file
worksheet = xlrd.open_workbook(filepath)
table = worksheet.sheet_by_index(1)  # read one sheet's contents (note: index 1 is the second sheet, not the first)
datas = []
# The first and second rows are not read in
for i in range(table.nrows):
    if i == 0:
        continue
    ...
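A complete version of the xlrd pattern, assuming the truncated loop body appends each remaining row to datas. The original comment says two header rows are skipped while the shown code skips only one; the sketch honors the comment. filepath and the import of xlrd come from the original; note that xlrd 2.0+ reads only .xls files.

import xlrd

def read_excel(filepath):
    worksheet = xlrd.open_workbook(filepath)
    # sheet_by_index(0) is the first sheet; the original used index 1 (the second)
    table = worksheet.sheet_by_index(1)
    datas = []
    for i in range(table.nrows):
        if i in (0, 1):  # per the original comment, skip the first two rows
            continue
        datas.append(table.row_values(i))  # one list of cell values per row
    return datas

if __name__ == '__main__':
    for row in read_excel('data.xls'):
        print(row)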
data-understan
Original post · 2018-10-23 00:38:20

This post is the output code from a data-processing pass.

from collections import defaultdict

def get_count(fPath):
    invertedIndex = defaultdict(list)
    docNumber = 0
    text = []
    with open(fPath, 'r', encoding='utf-8') as f:
        ...
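The with block is cut off; below is a minimal sketch of an inverted index built with the same names, assuming one document per line and whitespace tokenization (the original's tokenizer and return value are not shown).

from collections import defaultdict

def get_count(fPath):
    # Map each token to the list of document numbers it appears in
    invertedIndex = defaultdict(list)
    docNumber = 0
    text = []
    with open(fPath, 'r', encoding='utf-8') as f:
        for line in f:            # assumption: one document per line
            words = line.split()  # assumption: whitespace tokenization
            text.append(words)
            for w in set(words):  # record each document once per token
                invertedIndex[w].append(docNumber)
            docNumber += 1
    return invertedIndex, docNumber, text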
Saving Data as a Pickle File
Original post · 2018-11-03 16:12:51

import pickle

# Save as a pickle file
pipe_path = 'pipe_feature_nb_all.pkl'
with open(pipe_path, 'wb') as fw:
    pickle.dump(Pipe, fw)

# Load a pickle file (a with block closes the file handle, unlike the bare open call)
with open('pipe_feature_dt_all.pkl', 'rb') as fr:
    pipe = pickle.load(fr)
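Pipe is undefined in the snippet (presumably a fitted model pipeline), and the load step reads a different file, pipe_feature_dt_all.pkl, presumably a second saved pipeline. A self-contained round trip with a stand-in object shows the save/load pair working end to end:

import pickle

Pipe = {'model': 'nb', 'features': ['tfidf']}  # stand-in for the fitted pipeline

# Save: serialize the object to disk in binary mode
with open('pipe_feature_nb_all.pkl', 'wb') as fw:
    pickle.dump(Pipe, fw)

# Load: deserialize it back from the same file
with open('pipe_feature_nb_all.pkl', 'rb') as fr:
    pipe = pickle.load(fr)

assert pipe == Pipe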
json_load
Original post · 2018-11-12 14:08:13

import json

with open('/Users/ronald/Downloads/Archive/resume.json', 'r') as f:
    cvResult = f.readlines()

for c in cvResult:
    if json.loads(c)['job_id']['$oid'] == '5bd30948f7bf070001464815':
        ...
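The loop body is truncated after the if; here is a sketch assuming each line of resume.json holds one JSON record and the goal is to collect the records whose job_id.$oid matches. The path and oid value come from the original; find_job is a hypothetical wrapper name.

import json

def find_job(path, oid):
    matches = []
    with open(path, 'r') as f:
        for line in f:  # assumption: one JSON object per line
            record = json.loads(line)
            if record['job_id']['$oid'] == oid:
                matches.append(record)
    return matches

if __name__ == '__main__':
    for rec in find_job('/Users/ronald/Downloads/Archive/resume.json',
                        '5bd30948f7bf070001464815'):
        print(rec)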