毛豆汽车数据爬虫–附源码
没啥教程，就是简单的爬虫，加个正则。有疑问可在公众号后台留言，会给你处理。
公众号–>python网络小蜘蛛
# -*- coding: utf-8 -*-
# @ModuleName:毛豆
# @Function(功能):
# @Author : 苏穆冰白月晨
# @Time : 2021/4/7 14:22
import requests
from fake_useragent import UserAgent
import re
import csv
# Request headers for every page fetch.
# NOTE: the HTTP header name is 'User-Agent' (with a hyphen); the original
# 'UserAgent' key is silently ignored by servers, so the random UA was never sent.
headers = {
    'User-Agent': UserAgent().random,
}
def response(i):
    """Fetch listing page *i* and extract every car entry found on it.

    Args:
        i: page number appended to the listing URL path (pg{i}).
    """
    url = "https://www.maodou.com/car-list/all/pg" + str(i) + "?keyword="
    # timeout keeps the crawl from hanging forever on a dead connection.
    page_html = requests.get(url, headers=headers, timeout=10).text
    # A listing page holds at most 14 cars; use a distinct loop variable so the
    # page number parameter is not shadowed (the original reused `i` for both).
    for entry_index in range(14):
        try:
            response_re(page_html, entry_index)
        except IndexError:
            # findall()[entry_index] raises IndexError when the page has fewer
            # than 14 entries — that is the expected stop condition. The original
            # bare `except:` also hid real bugs (regex typos, CSV errors).
            break
def response_re(response, i):
    """Regex-extract the *i*-th car entry from a listing page and write it to CSV.

    Args:
        response: raw HTML text of a listing page.
        i: index of the entry within the page's regex match lists.

    Raises:
        IndexError: when the page contains fewer than ``i + 1`` entries
            (the caller uses this to stop iterating a page).
    """
    down_payment_pat = """<p class="pre-price">首付 <em class="hot">(.*?)</em> 万</p>"""
    shoufu = re.findall(down_payment_pat, response)[i]
    title_pat = """<span class="info">(.*?)</span></h2> <div class="car-price">"""
    zhuti = re.findall(title_pat, response)[i]
    monthly_pat = """<p class="for-month">(.*?)</p>"""
    yue = re.findall(monthly_pat, response)[i]
    # The real image URL lives in the lazy-load `data-original` attribute
    # (`src` is only a placeholder). The original code ran two extractions —
    # the first result was dead code, immediately overwritten — and the second
    # pattern captured the surrounding quotes and trailing space into the CSV.
    # Capture inside the quotes so only the URL itself is stored.
    image_pat = """<img class="lazy" src=".*?" data-original="(.*?)" alt=".*?">"""
    tupian = re.findall(image_pat, response)[i]
    data = {
        "Theme": zhuti,
        "Down payment": shoufu,
        "Monthly payment": yue,
        "Image": tupian,
    }
    # csv_writer is a module-level global created in the __main__ block.
    csv_writer.writerow([zhuti, shoufu, yue, tupian])
    print(data)
if __name__ == '__main__':
    # newline='' is required by the csv module to avoid blank rows on Windows.
    # `with` guarantees the file is flushed and closed even if a request fails
    # (the original opened the file and never closed it).
    with open('maodou.csv', 'w', encoding='utf-8', newline='') as f:
        # Module-level name on purpose: response_re() writes through this global.
        csv_writer = csv.writer(f)
        csv_writer.writerow(["Theme", "Down payment", "Monthly payment", "Image"])
        for page in range(0, 999):
            response(page)