#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-10 20:19:29
# Project: reo

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    # on_start(self) is the entry point of the script; it is called first when
    # you click the run button at the top right of the green panel on the left.
    @every(minutes=24 * 60)
    def on_start(self):
        # Create a crawl task: url is the target address, callback is the
        # function invoked once the page has been fetched.
        self.crawl('www.reeoo.com', callback=self.index_page, validate_cert=False)

    # The parameter is a Response object.
    # response.doc is a PyQuery object; PyQuery works like jQuery and makes it
    # easy to extract data from the matching tags of the returned HTML document.
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # for each in response.doc('a[href^="http"]').items():
        #     self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        for each in response.doc('div[class="thumb"]').items():
            detail_url = each('a').attr.href
            print(detail_url)
            self.crawl(detail_url, callback=self.detail_page, validate_cert=False)

    # Return a dict as the result; it is saved automatically to the default
    # resultdb. You can also override a method to store the result in a
    # database of your choice.
    @config(priority=2)
    def detail_page(self, response):
        header = response.doc('body > article > section > header')
        title = header('h1').text()

        tags = []
        for each in header.items('a'):
            tags.append(each.text())

        content = response.doc('div[id="post_content"]')
        description = content('blockquote > p').text()
        website_url = content('a').attr.href

        image_url_list = []
        for each in content.items('img[data-src]'):
            image_url_list.append(each.attr('data-src'))

        return {
            "title": title,
            "tags": tags,
            "description": description,
            "image_url_list": image_url_list,
            "website_url": website_url,
        }
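# --- Optional: storing results in your own database ---------------------------
# The comment above detail_page notes that the returned dict is saved to the
# default resultdb and that you can override a method to store it elsewhere.
# A minimal sketch using pyspider's on_result hook together with MongoDB is
# shown below; pymongo, the localhost connection, and the 'reeoo'/'design'
# database and collection names are illustrative assumptions, not part of the
# original project.
import pymongo

client = pymongo.MongoClient('localhost', 27017)


class MongoHandler(Handler):
    def on_result(self, result):
        # on_result receives the dict returned by each callback.
        if not result:
            return
        client['reeoo']['design'].insert_one(result)
        # Keep the default behaviour too, so the result still reaches resultdb.
        super(MongoHandler, self).on_result(result)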
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-10 20:19:29
# Project: reo

from pyspider.libs.base_handler import *
import urllib


class Handler(BaseHandler):
    crawl_config = {
    }

    # on_start(self) is the entry point of the script; it is called first when
    # you click the run button at the top right of the green panel on the left.
    @every(minutes=24 * 60)
    def on_start(self):
        # Create a crawl task: url is the target address, callback is the
        # function invoked once the page has been fetched.
        self.crawl('http://www.mzitu.com/150114', callback=self.index_page, validate_cert=False)

    # The parameter is a Response object.
    # response.doc is a PyQuery object; PyQuery works like jQuery and makes it
    # easy to extract data from the matching tags of the returned HTML document.
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # for each in response.doc('a[href^="http"]').items():
        #     self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        for each in response.doc('.main-image img').items():
            detail_url = each('img').attr.src
            print(detail_url)
            # URL of the image on the remote server
            # img_src = detail_url
            # Download the remote data to a local file; the second argument is
            # the local file name to save it under.
            # urllib.request.urlretrieve(img_src, 'D:\\1.jpg')

        for each in response.doc('.pagenavi > a:last-child').items():
            # Follow the "next page" URL by calling index_page again, so the
            # images on the next page (and on its next page) are also fetched.
            self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)
            # self.crawl(detail_url, callback=self.detail_page, validate_cert=False)

    # Return a dict as the result; it is saved automatically to the default
    # resultdb. You can also override a method to store the result in a
    # database of your choice.
    @config(priority=2)
    def detail_page(self, response):
        return {
            "title": "title",
        }
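# --- Optional: downloading the images through pyspider itself -----------------
# Instead of calling urllib.request.urlretrieve directly inside index_page (as
# in the commented-out lines above), each image URL can be scheduled as its own
# crawl task and written to disk in the callback. This is a sketch, not the
# original script: the save= parameter carries the target file name to the
# callback via response.save, and the 'E:\\mzituTest\\' directory is an
# illustrative assumption.
from pyspider.libs.base_handler import *


class ImageSaveHandler(BaseHandler):
    def index_page(self, response):
        for count, each in enumerate(response.doc('.main-image img').items()):
            img_url = each.attr.src
            # Schedule the image itself as a crawl task; save= travels with the
            # task and comes back as response.save in the callback.
            self.crawl(img_url, callback=self.save_img, validate_cert=False,
                       save={'path': 'E:\\mzituTest\\' + str(count) + '.jpg'})

    def save_img(self, response):
        # response.content holds the raw bytes of the fetched image.
        with open(response.save['path'], 'wb') as f:
            f.write(response.content)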
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-13 21:28:59
# Project: test003

from pyspider.libs.base_handler import *
import urllib.request
import time


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Render the page with the JavaScript fetcher (fetch_type='js') and use
        # a tall viewport so lazily loaded images further down the page appear.
        self.crawl(
            'https://www.163.com/',
            fetch_type='js', callback=self.index_page, validate_cert=False,
            js_viewport_height=12000)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        count = 0
        for each in response.doc('img').items():
            detail_url = each('img').attr.src
            print(detail_url)
            work_path = 'E:\\phantomjsTest\\temp\\' + str(count) + '.jpg'
            count = count + 1
            if detail_url:
                # Download the remote image to the local work_path file.
                urllib.request.urlretrieve(detail_url, work_path)
            else:
                print('is none')
            # urllib.request.urlretrieve(detail_url, work_path)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
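# --- Optional: a more defensive download helper --------------------------------
# JavaScript-rendered pages often contain relative src values and data: URIs,
# which make urlretrieve fail. The helper below is a sketch, not part of the
# original script: it skips unusable src values, resolves relative URLs against
# the page URL with urllib.parse.urljoin, and creates the target directory
# first. It reuses the directory path from the script above; everything else is
# standard library only.
import os
import urllib.parse
import urllib.request


def download_images(response, work_dir='E:\\phantomjsTest\\temp\\'):
    os.makedirs(work_dir, exist_ok=True)
    count = 0
    for each in response.doc('img').items():
        src = each.attr.src
        if not src or src.startswith('data:'):
            # Skip missing src attributes and inline data: URIs.
            continue
        # Resolve relative paths like '/img/logo.png' against the page URL.
        full_url = urllib.parse.urljoin(response.url, src)
        urllib.request.urlretrieve(full_url, os.path.join(work_dir, '%d.jpg' % count))
        count += 1
    return count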