# -*- coding: utf-8 -*-
import requests
from requests import RequestException
import re
from pyquery import PyQuery as pq
# Request headers sent with every HTTP call in this module; the User-Agent
# string identifies the client as Chrome on an Android (Nexus 5) device.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.186 Mobile Safari/537.36'
    ),
}
def get_index_html(timeout=10):
    """Download the Mafengwo mobile guide ("gonglve") index page.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or any other
        ``RequestException``).
    """
    gonglv_url = 'https://m.mafengwo.cn/jd/10195/gonglve.html?sExt=gonglve&ext=gonglve'
    try:
        response = requests.get(gonglv_url, headers=headers, timeout=timeout)
    except RequestException:
        # Best-effort fetch: network failures (including timeouts, which are
        # RequestException subclasses) are reported to the caller as None.
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_index_url(index_html):
    """Yield the absolute URL of every ``/poi...`` link found in *index_html*.

    Args:
        index_html: HTML text to scan. ``None`` or an empty string (e.g. the
            result of a failed download) yields nothing.

    Yields:
        ``'https://m.mafengwo.cn'`` joined with each ``href`` value that
        starts with ``/poi``, in document order.
    """
    if not index_html:
        return
    # re.S lets the lazy ``.*?`` span newlines between ``<a`` and ``href``.
    pattern = re.compile(r'<a.*?href="(/poi.*?)"', re.S)
    for path in pattern.findall(index_html):
        yield 'https://m.mafengwo.cn' + path
    # No trailing ``return url``: in a generator the value would be discarded,
    # and with zero matches the name would be unbound and raise.
def get_html(url):
try:
爬取马蜂窝景点的数据
最新推荐文章于 2024-10-16 18:21:16 发布