pip install virtualenv
pip install -i https://pypi.doubanio.com/simple/ --trusted-host pypi.doubanio.com django
cmd 下 执行 where python
mkvirtualenv -p D:\ProgramData\Anaconda3\python.exe scrapypy3
mkvirtualenv -p D:\ProgramData\Anaconda3\python.exe py3scrapy
pip install virtualenvwrapper-win
workon
mkvirtualenv py3scrapy
指定env目录 设置环境变量 WORKON_HOME = E:\scrapy\Envs
设置path路径
C:\Users\Gapproxy\scrapytest\Scripts\scrapypy3\Scripts
workon py3scrapy
pip install requests
pip install -i https://pypi.doubanio.com/simple/ scrapy
mkvirtualenv -p D:\ProgramData\Anaconda3\python.exe article_spider
pip install -i https://pypi.doubanio.com/simple/ scrapy
scrapy startproject ArticleSpider
项目 在
e:\vwmare-system\scrapy\envs\article_spider\lib\site-packages\scrapy\templates\project
C:\Users\72038634\PycharmProjects\sc
导入PyCharm
cd ArticleSpider
scrapy genspider example example.com
scrapy genspider jobboole blog.jobbole.com
scrapy crawl jobboole
pip install -i https://pypi.doubanio.com/simple/ pypiwin32
//*[@id="post-110287"]/div[1]/h1
scrapy shell http://blog.jobbole.com/110287
title = response.xpath("//div[@class='entry-header']/h1/text()").extract()[0]
title.extract()
title.extract()[0]
create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace('·','')
create_date
//*[@id="post-110287"]/div[3]/div[9]/span[2]
//span[contains(@class,'vote-post-up')]
vote_num = int(response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract()[0])
book_mark_num = re.match(r".(\d+).",response.xpath("//span[contains(@class,'bookmark-btn')]/text()").extract()[0]).group(1)
article_comment_num = re.match(r".(\d+).",response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]).group(1)
author = response.xpath("//div[@class='copyright-area']/a/text()").extract()[0]
tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
tag_list_temp = []
for element in tag_list :
    match_re = re.match(r".(\d+).", element)
    if match_re:
        tag_list_temp.append(match_re.group(0).replace(match_re.group(1),'').replace(' ',''))
    else:
        tag_list_temp.append(element)
tags = ",".join(tag_list_temp)
response.css('#archive .post.floated-thumb .post-thumb a::attr(href)').extract()
response.css('.floated-thumb .post-thumb a::attr(href)').extract()
response.css("a::attr(href)").extract()
response.css("div .navigation .margin-20 a::attr(href)").extract()
response.css("navigation class:next a::attr(href)").extract_first()
response.css("#archive > div.navigation.margin-20").extract()
response.css("div:(#archive) ").extract()
scrapy shell http://web.jobbole.com/category/javascript-2/