# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import json
import requests
from requests.exceptions import RequestException
import re
import time
import pymysql
import pandas
# Start URL of the crawl: the "ershoufang" index page on sh.lianjia.com.
url = 'https://sh.lianjia.com/ershoufang/'
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The page text when the server answers with HTTP 200; ``None`` for
        any other status code or when the request raises a
        ``RequestException`` (timeouts included).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    # Keep the try body limited to the call that can actually raise.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Download the start page. get_re() searches this module-level `html`;
# it is None when the download failed.
html = get_one_page(url)
def get_re(res, text=None):
    """Return every match of the regular expression *res* in a page.

    The pattern is compiled with ``re.S`` so ``.`` also matches newlines.

    Args:
        res: Regular-expression source string.
        text: Text to search. When omitted, the module-level ``html``
            variable (the most recently downloaded page) is searched.

    Returns:
        The list produced by ``findall`` (strings, or tuples when the
        pattern has several groups). An empty list is returned when there
        is no page to search, e.g. because the download failed and
        ``html`` is ``None``; previously that case raised ``TypeError``.
    """
    if text is None:
        # Fall back to the page most recently stored in the global `html`.
        text = globals().get('html')
    if text is None:
        return []
    return re.compile(res, re.S).findall(text)
# Stage 1: turn every matching /ershoufang/ link on the start page into the
# corresponding /xiaoqu/ URL.
url2 = []
# Raw string: '\w' and '\s' are regex escapes, not Python string escapes.
res = r'<a href="/ershoufang/\w.+?"\s\stitle'
# `html` is None when the start page could not be downloaded; treat that
# as "no links" instead of letting the regex search fail on None.
titles = get_re(res) if html is not None else []
for title in titles:
    # The third '/'-separated field of the matched tag is the path segment
    # that follows /ershoufang/.
    url2.append('https://sh.lianjia.com/xiaoqu/' + title.split('/')[2])

# Stage 2: visit each stage-1 page and collect the /xiaoqu/<segment>/ links
# found on it.
url3 = []
res = r'<a href="/xiaoqu/\w+/"\s>'
for url in url2:
    html = get_one_page(url)
    if html is None:
        # Download failed for this page; move on to the next one.
        continue
    for title in get_re(res):
        url3.append('https://sh.lianjia.com/xiaoqu/' + title.split('/')[2])
# Stage 3: expand every URL in url3 into one URL per result page
# (<url>/pg1/, <url>/pg2/, ...), using the "totalPage" value embedded in
# the page source.
url4 = []
for count, url in enumerate(url3, 1):
    html = get_one_page(url)
    found = get_re(r'"totalPage":\w+,"') if html is not None else []
    digits = re.findall(r'\d+', found[0]) if found else []
    if digits:
        print(digits)
        total_pages = int(digits[0])
        for page in range(1, total_pages + 1):
            url4.append('{}/pg{}/'.format(url, page))
    # A page without a usable "totalPage" value is skipped on its own;
    # previously the resulting IndexError aborted the whole loop and
    # silently dropped every remaining URL.
    print('percent: {:.0%}'.format(count / len(url3)))
# Stage 4: walk every listing page in url4 and collect, for each matching
# link, its target URL (url5) and its link text (xiaoqu1). Both lists stay
# index-aligned.
url5 = []
xiaoqu1 = []
# Group 1 is the link target, group 2 the text between the tags. The old
# code split the match on '"' and took field 3, which is always the
# literal '_blank' from target="_blank", never the link text.
res = r'href="(https://sh.lianjia.com/xiaoqu/\d+/)" target="_blank">(.+?)</a>'
for count, url in enumerate(url4, 1):
    html = get_one_page(url)
    # Skip only the page that failed to download instead of aborting the
    # whole loop.
    if html is not None:
        for link, name in get_re(res):
            xiaoqu1.append(name)
            url5.append(link)
    print('percent: {:.0%}'.format(count / len(url4)))
def _first_tag_text(res):
    """Return the text between '>' and '<' in the first match of *res*.

    The search runs on the current page via get_re(). Raises IndexError
    when the pattern does not occur on the page.
    """
    first = get_re(res)[0]
    return first.split('>')[1].replace('<', '')


# Stage 5: visit every detail page in url5 and record one row per page:
# [detailTitle text, detailDesc text, xiaoquUnitPrice text, build-year text].
xiaoqu_list = []
for count, url in enumerate(url5, 1):
    html = get_one_page(url)
    row = None
    if html is not None:
        try:
            row = [
                _first_tag_text(r'class="detailTitle">.+?<'),
                _first_tag_text(r'class="detailDesc">.+?<'),
                _first_tag_text(r'class="xiaoquUnitPrice">\w+<'),
                _first_tag_text(r'class="xiaoquInfoContent">\w+年建成\s<'),
            ]
        except IndexError:
            # At least one field is missing on this page: skip the page.
            # Only this expected failure is swallowed; the previous bare
            # `except:` also hid KeyboardInterrupt and programming errors.
            row = None
    if row is not None:
        xiaoqu_list.append(row)
    # Progress now advances for skipped pages too, so it reaches 100%.
    print(count)
    print('percent: {:.0%}'.format(count / len(url5)))