Retry Class



http://www.cnblogs.com/mszhangxuefei/p/worknotes-10.html

//Retry机制
    public static class Retry { /// <summary>
        /// 重试零个参数无返回值的方法 /// </summary>
        /// <param name="action">执行方法方法</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        public static void Execute(Action action, TimeSpan retryInterval, int retryCount = 3) { Execute<object>(() => { action(); return null; }, retryInterval, retryCount); } /// <summary>
        /// 重试一个参数无返回值的方法 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <param name="action">执行方法方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        public static void Execute<T1>(Action<T1> action, T1 arg1, TimeSpan retryInterval, int retryCount = 3) { Execute<T1, object>((x1) => { action(arg1); return null; }, arg1, retryInterval, retryCount); } /// <summary>
        /// 重试两个参数无返回值的方法 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <param name="action">执行方法方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        public static void Execute<T1, T2>(Action<T1, T2> action, T1 arg1, T2 arg2, TimeSpan retryInterval, int retryCount = 3) { Execute<T1, T2, object>((x1, x2) => { action(arg1, arg2); return null; }, arg1, arg2, retryInterval, retryCount); } /// <summary>
        /// 重试三个参数无返回值的方法 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <typeparam name="T3">参数类型3</typeparam>
        /// <param name="action">执行方法方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="arg3">参数3</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        public static void Execute<T1, T2, T3>(Action<T1, T2, T3> action, T1 arg1, T2 arg2, T3 arg3, TimeSpan retryInterval, int retryCount = 3) { Execute<T1, T2, T3, object>((x1, x2, x3) => { action(arg1, arg2, arg3); return null; }, arg1, arg2, arg3, retryInterval, retryCount); } /// <summary>
        /// 重试四个参数无返回值的方法 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <typeparam name="T3">参数类型3</typeparam>
        /// <typeparam name="T4">参数类型4</typeparam>
        /// <param name="action">执行方法方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="arg3">参数3</param>
        /// <param name="arg4">参数4</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        public static void Execute<T1, T2, T3, T4>(Action<T1, T2, T3, T4> action, T1 arg1, T2 arg2, T3 arg3, T4 arg4, TimeSpan retryInterval, int retryCount = 3) { Execute<T1, T2, T3, T4, object>((x1, x2, x3, x4) => { action(arg1, arg2, arg3, arg4); return null; }, arg1, arg2, arg3, arg4, retryInterval, retryCount); } /// <summary>
        /// 重试零个参数带返回值 /// </summary>
        /// <typeparam name="T">返回类型</typeparam>
        /// <param name="func">执行的方法</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        /// <returns>返回类型T</returns>
        public static T Execute<T>(Func<T> func, TimeSpan retryInterval, int retryCount = 3) { var exceptions = new List<Exception>(); for (int retry = 0; retry < retryCount; retry++) { try { return func(); } catch (Exception ex) { exceptions.Add(ex); Thread.Sleep(retryInterval); } } throw new AggregateException(exceptions); } /// <summary>
        /// 重试一个参数带返回值 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T">返回类型</typeparam>
        /// <param name="func">执行的方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        /// <returns>返回类型T</returns>
        public static T Execute<T1, T>(Func<T1, T> func, T1 arg1, TimeSpan retryInterval, int retryCount = 3) { var exceptions = new List<Exception>(); for (int retry = 0; retry < retryCount; retry++) { try { return func(arg1); } catch (Exception ex) { exceptions.Add(ex); Thread.Sleep(retryInterval); } } throw new AggregateException(exceptions); } /// <summary>
        /// 重试两个参数带返回值 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <typeparam name="T">返回类型</typeparam>
        /// <param name="func">执行的方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        /// <returns>返回类型T</returns>
        public static T Execute<T1, T2, T>(Func<T1, T2, T> func, T1 arg1, T2 arg2, TimeSpan retryInterval, int retryCount = 3) { var exceptions = new List<Exception>(); for (int retry = 0; retry < retryCount; retry++) { try { return func(arg1, arg2); } catch (Exception ex) { exceptions.Add(ex); Thread.Sleep(retryInterval); } } throw new AggregateException(exceptions); } /// <summary>
        /// 重试三个参数带返回值 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <typeparam name="T3">参数类型3</typeparam>
        /// <typeparam name="T">返回类型</typeparam>
        /// <param name="func">执行的方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="arg3">参数3</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        /// <returns>返回类型T</returns>
        public static T Execute<T1, T2, T3, T>(Func<T1, T2, T3, T> func, T1 arg1, T2 arg2, T3 arg3, TimeSpan retryInterval, int retryCount = 3) { var exceptions = new List<Exception>(); for (int retry = 0; retry < retryCount; retry++) { try { return func(arg1, arg2, arg3); } catch (Exception ex) { exceptions.Add(ex); Thread.Sleep(retryInterval); } } throw new AggregateException(exceptions); } /// <summary>
        /// 重试四个参数带返回值 /// </summary>
        /// <typeparam name="T1">参数类型1</typeparam>
        /// <typeparam name="T2">参数类型2</typeparam>
        /// <typeparam name="T3">参数类型3</typeparam>
        /// <typeparam name="T4">参数类型4</typeparam>
        /// <typeparam name="T">返回类型</typeparam>
        /// <param name="func">执行的方法</param>
        /// <param name="arg1">参数1</param>
        /// <param name="arg2">参数2</param>
        /// <param name="arg3">参数3</param>
        /// <param name="arg4">参数4</param>
        /// <param name="retryInterval">重试间隔</param>
        /// <param name="retryCount">重试次数</param>
        /// <returns>返回类型T</returns>
        public static T Execute<T1, T2, T3, T4, T>(Func<T1, T2, T3, T4, T> func, T1 arg1, T2 arg2, T3 arg3, T4 arg4, TimeSpan retryInterval, int retryCount = 3) { var exceptions = new List<Exception>(); for (int retry = 0; retry < retryCount; retry++) { try { return func(arg1, arg2, arg3, arg4); } catch (Exception ex) { exceptions.Add(ex); Thread.Sleep(retryInterval); } } throw new AggregateException(exceptions); } }

import csv
import json
import random
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class LianjiaSpider:
    """Scrape second-hand housing listings for Chengdu from cd.lianjia.com.

    Workflow: ``crawl`` walks the paginated list pages, ``parse_list_page``
    extracts the detail-page links, ``parse_detail_page`` turns each detail
    page (plus one extra "surroundings" request for subway data) into a dict,
    and ``save_to_csv`` writes all collected dicts to disk.
    """

    # Column order shared by parse_detail_page (gap filling) and save_to_csv.
    FIELDS = ['小区', '行政区域', '总价(万元)', '单价(元/㎡)', '户型', '所在楼层',
              '装修情况', '梯户比例', '房屋年限', '产权所属', '建筑面积(㎡)',
              '户型结构', '建筑类型', '房屋朝向', '建筑结构', '距离地铁站(m)',
              '最近地铁站', '地铁线路']

    # Label fragment on the detail page -> output column, checked in this order.
    BASE_LABELS = [
        ('房屋户型', '户型'),
        ('所在楼层', '所在楼层'),
        ('建筑面积', '建筑面积(㎡)'),
        ('户型结构', '户型结构'),
        ('建筑类型', '建筑类型'),
        ('房屋朝向', '房屋朝向'),
        ('建筑结构', '建筑结构'),
        ('装修情况', '装修情况'),
        ('梯户比例', '梯户比例'),
    ]

    def __init__(self):
        """Create the HTTP session (with transport-level retries) and warm it up."""
        self.base_url = "https://cd.lianjia.com/ershoufang/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': self.base_url,
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.session = requests.Session()
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        self.session.mount('https://', HTTPAdapter(max_retries=retry))
        # Warm-up request so the session picks up whatever cookies the site sets.
        # It is best-effort: a failure here must not prevent the spider from
        # being constructed, because get_page_content has its own retry loop.
        try:
            self.session.get(self.base_url, headers=self.headers, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"请求出错: {e}")
        self.house_data = []

    def get_page_content(self, url):
        """Fetch ``url`` with up to three attempts.

        Returns the response body as text on HTTP 200, otherwise ``None``.
        A random delay precedes every attempt to keep the request rate low.
        """
        for i in range(3):
            try:
                time.sleep(random.uniform(2, 4))
                resp = self.session.get(url, headers=self.headers, timeout=30)
                resp.encoding = 'utf-8'
                if resp.status_code == 200:
                    return resp.text
                print(f"请求失败,状态码: {resp.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"请求出错: {e} 第{i + 1}次")
                if i == 2:
                    return None
                time.sleep(random.uniform(3, 6))
        return None

    def parse_list_page(self, html):
        """Return the absolute detail-page URLs found on one list page."""
        soup = BeautifulSoup(html, 'html.parser')
        house_links = []
        for a in soup.select('li.clear a.noresultRecommend'):
            href = a.get('href')
            if href:
                # urljoin handles absolute, protocol-relative ("//host/...") and
                # root-relative ("/ershoufang/...") links alike; plain string
                # concatenation with 'https:' produced broken URLs for the latter.
                house_links.append(urljoin(self.base_url, href))
        return house_links

    @staticmethod
    def _labelled_items(container):
        """Yield ``(label_text, value_text)`` for every labelled ``<li>`` in ``container``."""
        for item in container.find_all('li'):
            label = item.find('span', class_='label')
            if not label:
                continue
            label_text = label.text.strip()
            yield label_text, item.text.replace(label_text, '').strip()

    @staticmethod
    def _extract_house_id(url):
        """Derive the listing id from a detail-page URL (``.../<id>.html``)."""
        match = re.search(r'(\d+)\.html', url)
        if match:
            return match.group(1)
        # Fall back to the last path segment when the URL has no numeric id.
        return url.strip('/').split('/')[-1].replace('.html', '')

    def _fetch_subway_info(self, url):
        """Query the "surroundings" endpoint of a listing for its nearest subway stop.

        Returns a dict with the keys '最近地铁站', '地铁线路' and '距离地铁站(m)';
        all values are empty strings when nothing could be retrieved. Network
        and JSON errors are reported and swallowed so that a missing subway
        entry never discards the rest of the listing.
        """
        subway = {'距离地铁站(m)': '', '最近地铁站': '', '地铁线路': ''}
        house_id = self._extract_house_id(url)
        # NOTE(review): endpoint path and response schema are taken over from the
        # original code unverified -- confirm against the site's actual XHR
        # traffic for the "surroundings" panel.
        around_url = f'https://cd.lianjia.com/ershoufang/house/{house_id}/aroundInfo/'

        # Step 1: POST once to "activate" the surroundings data (best-effort).
        try:
            response = self.session.post(around_url, headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
                'Content-Type': 'application/json'
            }, timeout=15)
            if response.status_code != 200:
                print("Failed to activate data:", response.status_code)
        except requests.exceptions.RequestException as e:
            print("Error during POST request:", e)

        # Step 2: GET the JSON payload and read the first subway entry.
        try:
            response = self.session.get(around_url, headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'application/json, text/plain, */*'
            }, timeout=15)
            if response.status_code != 200:
                print("Failed to retrieve data:", response.status_code)
                return subway
            data = response.json()
        except requests.exceptions.RequestException as e:
            print("Error during GET request:", e)
            return subway
        except ValueError as e:
            print("Error parsing JSON:", e)
            return subway

        subway_list = data.get('subway') if isinstance(data, dict) else None
        if not subway_list or not isinstance(subway_list[0], dict):
            return subway

        nearest = subway_list[0]
        line = str(nearest.get('line') or '').replace('地铁', '')
        name = str(nearest.get('name') or '')
        # The distance may arrive as a number or as text such as "350米";
        # normalise to str and tolerate values without digits.
        dist_match = re.search(r'\d+', str(nearest.get('distance') or ''))

        subway['最近地铁站'] = f'地铁{line} {name}'
        subway['地铁线路'] = f'地铁{line}' if line else ''
        subway['距离地铁站(m)'] = dist_match.group() if dist_match else ''
        return subway

    def parse_detail_page(self, url, html):
        """Parse one listing detail page into a dict keyed by ``FIELDS``.

        ``url`` is the detail page's own URL; it is used to derive the listing
        id for the subway lookup and as the Referer of that request. Returns
        ``None`` when parsing fails unexpectedly.
        """
        soup = BeautifulSoup(html, 'html.parser')
        house_info = {}
        try:
            # 1. Residential community
            community = soup.find('div', class_='communityName')
            community_link = community.find('a') if community else None
            house_info['小区'] = community_link.text.strip() if community_link else ''

            # 2. Administrative district
            area_info = soup.find('div', class_='areaName')
            area_links = area_info.find_all('a') if area_info else []
            house_info['行政区域'] = area_links[0].text.strip() if area_links else ''

            # 3. Total price / unit price
            total_price = soup.find('span', class_='total')
            house_info['总价(万元)'] = total_price.text.strip() if total_price else ''
            unit_price = soup.find('span', class_='unitPriceValue')
            house_info['单价(元/㎡)'] = unit_price.text.strip() if unit_price else ''

            # 4. Basic attributes
            base_info = soup.find('div', class_='base')
            if base_info:
                for label_text, value in self._labelled_items(base_info):
                    for fragment, column in self.BASE_LABELS:
                        if fragment in label_text:
                            if column == '建筑面积(㎡)':
                                value = value.replace('㎡', '').strip()
                            house_info[column] = value
                            break

            # 5. Transaction attributes
            transaction_info = soup.find('div', class_='transaction')
            if transaction_info:
                for label_text, value in self._labelled_items(transaction_info):
                    if '房屋年限' in label_text or '建成年代' in label_text:
                        house_info['房屋年限'] = value
                    elif '产权所属' in label_text:
                        house_info['产权所属'] = value

            # 6. Subway information. Merge into house_info (the earlier code
            #    re-created the dict here and lost steps 1-5) and use the real
            #    page URL (the earlier code overwrote it with a placeholder, so
            #    the lookup always asked for listing "xxxxx").
            house_info.update(self._fetch_subway_info(url))

            # 7. Fill in any column that could not be found
            for f in self.FIELDS:
                house_info.setdefault(f, '')
            return house_info
        except Exception as e:
            print(f"解析详情页出错 {url}: {e}")
            return None

    def crawl(self, max_pages=3):
        """Crawl ``max_pages`` list pages and collect every listing into ``house_data``."""
        print("开始爬取链家成都二手房数据...")
        for page in range(1, max_pages + 1):
            print(f"\n正在爬取第 {page} 页...")
            page_url = self.base_url if page == 1 else f"{self.base_url}pg{page}/"
            html = self.get_page_content(page_url)
            if not html:
                print(f"放弃第 {page} 页")
                continue
            house_links = self.parse_list_page(html)
            print(f"本页拿到 {len(house_links)} 条")
            for idx, link in enumerate(house_links, 1):
                detail_html = self.get_page_content(link)
                if detail_html:
                    info = self.parse_detail_page(link, detail_html)
                    if info:
                        self.house_data.append(info)
                        subway_info = f"{info.get('最近地铁站', '无')} ({info.get('地铁线路', '')}) {info.get('距离地铁站(m)', '')}米"
                        print(f"  完成 {idx}/{len(house_links)} {info.get('小区', 'N/A')} - 地铁: {subway_info}")
                time.sleep(random.uniform(1.5, 3))
            time.sleep(random.uniform(5, 8))
        print(f"\n爬取结束!共 {len(self.house_data)} 条")

    def save_to_csv(self, filename='chengdu_lianjia_ershoufang.csv'):
        """Write the collected listings to ``filename`` (UTF-8 with BOM, for Excel)."""
        if not self.house_data:
            print("没有数据可保存")
            return
        with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=self.FIELDS)
            writer.writeheader()
            writer.writerows(self.house_data)
        print(f"数据已保存到 {filename}")


if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.crawl(max_pages=1)
    spider.save_to_csv()

# Reader question that accompanied this script (translated): "With this code the
# subway information cannot be scraped. The subway information lives in the
# 'surrounding facilities' section of the listing detail page. Please modify the
# code so that the subway information is scraped."
# The causes visible in the script itself -- the placeholder URL and the
# re-created result dict in parse_detail_page -- are corrected above.
11-17
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值