背景
实现自动获取网页内容,节省复制粘贴时间成本
模拟登录
def Land_Stage(url):  # configure the browser and open the page
    """Start a Chrome WebDriver session and navigate it to ``url``.

    Args:
        url: Address of the page to open.

    Returns:
        The Chrome WebDriver instance after ``get(url)`` has returned.

    Raises:
        Exception: Whatever ``driver.get`` raises. The browser is shut down
            before the error propagates, so a failed navigation does not
            leave an orphaned Chrome process behind.
    """
    option = webdriver.ChromeOptions()
    # NOTE(review): presumably controls Chrome's popup content setting;
    # the meaning of the value 0 is not visible here - confirm.
    prefs = {'profile.default_content_settings.popups': 0}
    option.add_experimental_option('prefs', prefs)

    driver = webdriver.Chrome(options=option)
    # Element lookups keep polling for up to 20 seconds before failing.
    driver.implicitly_wait(20)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver on failure, so close it here.
        driver.quit()
        raise
    return driver
定位到目标报告,查看报告并拿到网页源码
# Locate the row whose <div> text contains the report name, then click the
# first inner <div> of that row's "textLeft process" cell.
# The name must be a quoted XPath string literal: unquoted, XPath parses it
# as an expression (element name / number) instead of the text to search for.
# NOTE(review): assumes fname contains no double-quote character - confirm.
pather = '//div[contains(text(), "{}")]/parent::td/../td[@class="textLeft process"]/div/div[1]'.format(fname)
# find_element(by, value) is available in both Selenium 3 and 4; the
# find_element_by_* helpers were removed in Selenium 4.3.
driver.find_element('xpath', pather).click()
# Capture the HTML of the current page after the click.
page = driver.page_source
利用正则,简单粗暴清洗网页源码,拿到目标数据
def Extraction_Data(page):  # clean the page source into a dict
    """Pull the report metrics out of the page's HTML source with regexes.

    Args:
        page: HTML source of the report page, as a string.

    Returns:
        dict: Maps each output label to the list of strings captured by
        ``re.findall`` for that metric. A metric that is not found maps to
        an empty list.
    """
    # Most metrics share the layout "<label><br><span>value</span>".
    span_metric = '{}<br><span>(.*?)</span>'

    # (output key, regex) pairs, in the order the keys appear in the result.
    # NOTE(review): the '购买率' pattern also matches inside '新客购买率', so
    # that list can contain more than one entry - confirm this is intended.
    # NOTE(review): the key '老客购买率:' is filled from the page label
    # '老客复购率' - kept as in the original.
    metric_patterns = (
        ('推广目标:', '推广目标:(.*?)<'),
        ('触达总人数:', '活动触达总人数.*?<span class="size">(.*?)<'),
        ('总购买人数:', span_metric.format('购买人数')),
        ('购买率:', span_metric.format('购买率')),
        ('新客人数:', span_metric.format('新客人数')),
        ('老客人数:', span_metric.format('老客人数')),
        ('触达已购:', span_metric.format('触达已购人数')),
        ('触达未购:', span_metric.format('触达未购人数')),
        ('新客购买率:', span_metric.format('新客购买率')),
        ('老客购买率:', span_metric.format('老客复购率')),
        ('新客占比:', span_metric.format('新客占比')),
        ('老客占比:', span_metric.format('老客占比')),
    )
    return {label: re.findall(pattern, page) for label, pattern in metric_patterns}