#!/usr/bin/python
# -*- coding: GBK -*-
import time
import requests
import arrow
import hashlib
from unipath import Path
def file_get_contents(url):
    """Fetch a URL and return its body decoded as GBK (the target site's encoding)."""
    s = requests.Session()
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0",
    }
    r = s.get(url, timeout=10, headers=headers)
    r.encoding = 'gbk'
    return r.text
def by_market(market_id):
    """Walk the quarters from 2014-01-01 to now and fetch one page per quarter."""
    page = 1  # pagination is stubbed out: only the first page of each quarter is fetched
    # The real URL template is redacted in the original post. It must contain the
    # {market_id}, {start_time}, {end_time} and {page} placeholders used below.
    url_template = ''
    for start in arrow.Arrow.range('quarter', arrow.get('2014-01-01', 'YYYY-MM-DD'), arrow.now()):
        url = url_template.format(
            market_id=market_id,
            start_time=start.format("YYYY-MM-DD"),
            end_time=start.shift(months=3).format("YYYY-MM-DD"),
            page=page,
        )
        Item(url)  # fetching and printing happen in Item.__init__
class Item:
    def __init__(self, url):
        self.url = url
        self.content = self.get_contents(url)
        print(self.content)

    def get_contents(self, url):
        """Return the page body, served from the file cache when possible."""
        # Cache files are keyed by the MD5 hash of the URL.
        cache = Path('./cache/' + hashlib.md5(url.encode('GBK')).hexdigest())
        if cache.exists():
            content = cache.read_file()
        else:
            content = file_get_contents(url)
            cache.parent.mkdir(parents=True)  # make sure ./cache exists before writing
            cache.write_file(content)
            time.sleep(1)  # be polite: pause only after an actual network hit
        return content
#print(file_get_contents(url))
by_market(2604461)
Python code for scraping a certain platform
This post walks through a small Python web-scraping example. Given a URL template and a date range, the script fetches quarterly page data for a given market ID, using the Requests library for the HTTP calls and a simple file-based cache to avoid refetching pages it has already seen.
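
The actual URL template is redacted in the post, so here is a minimal usage sketch under an assumed, hypothetical example.com endpoint; only the four placeholder names are known for certain, from the format() call in by_market():

# Hypothetical endpoint -- the original post redacts the real URL template.
# Only the placeholder names ({market_id}, {start_time}, {end_time}, {page})
# are known from the format() call in by_market().
url_template = ('http://example.com/api/deals'
                '?market={market_id}&start={start_time}'
                '&end={end_time}&page={page}')

url = url_template.format(
    market_id=2604461,        # same market ID the script passes to by_market()
    start_time='2014-01-01',  # a quarter's start date
    end_time='2014-04-01',    # the start date shifted three months forward
    page=1,
)
item = Item(url)  # fetches the page (or reads it from ./cache) and prints it

Note that time.sleep(1) runs only on a cache miss, so re-running the script over already-cached quarters is fast while fresh requests still hit the site at a polite rate.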