用python抓了一些数据存到本地

import codecs

from xml.dom.minidom import Document
import requests
from bs4 import BeautifulSoup

# Module-level XML document shared by WirteXml (which appends child nodes)
# and the __main__ block (which serializes it to people.xml at the end).
doc = Document()
def getAllUrl(pageCount):
    """Return the listing-page URL for page number *pageCount*."""
    return f'https://www.xxx.co/xxxx/{pageCount}'

def getHtml(pageCount, timeout=30):
    """Fetch the listing page for *pageCount* and return the Response.

    ``requests.get`` has no default timeout, so the original call could
    hang indefinitely on a stalled connection; *timeout* (seconds, default
    30) bounds the wait while staying backward-compatible for existing
    callers.
    """
    return requests.get(getAllUrl(pageCount), timeout=timeout)

def WirteXml(gName, gImg, wUrl):
    """Append <name>, <imgUrl> and <webUrl> text elements for one entry.

    Relies on the module globals ``doc`` (the minidom Document) and
    ``aperson`` (the <person> element created in the __main__ block).
    NOTE(review): passing these in explicitly would be cleaner — TODO.
    """
    name = doc.createElement("name")
    aperson.appendChild(name)
    name.appendChild(doc.createTextNode(gName))

    img = doc.createElement("imgUrl")
    aperson.appendChild(img)
    # BUG FIX: the original wrote ``img.append.Child(prersonUrl)`` which
    # raises AttributeError (silently swallowed by the caller's bare
    # except), so the imgUrl text node was never attached.
    img.appendChild(doc.createTextNode(gImg))

    weburl = doc.createElement("webUrl")
    aperson.appendChild(weburl)
    weburl.appendChild(doc.createTextNode(wUrl))

if __name__ == '__main__':
    filename = "people.xml"
    # Root element plus the single <person> node that WirteXml appends into.
    people = doc.createElement("Actresses")
    doc.appendChild(people)
    aperson = doc.createElement("person")
    people.appendChild(aperson)
    for count in range(1, 1250):
        html = getHtml(count).text
        soup = BeautifulSoup(html, "lxml")
        for tag in soup.findAll("img"):
            try:
                girlName = tag.attrs["title"]
                girlImage = tag.attrs["src"]
                # Strip the last 6 chars of the filename to recover the id.
                webUrl = "https://www.xxx.co/xx/" + tag.attrs["src"].split('/')[-1][:-6]
                WirteXml(girlName, girlImage, webUrl)
            except (KeyError, AttributeError):
                # KeyError: <img> tags lacking title/src are skipped, as
                # before.  AttributeError is also tolerated (the original
                # WirteXml could raise it); any other exception type now
                # propagates instead of being silently swallowed by a
                # bare except.
                continue
        print("" + str(count) + "页抓完!!!")
    # Open the file only when actually writing, and let the context
    # manager close it even if serialization fails (the original held an
    # open handle for the entire scrape and only closed it manually).
    with codecs.open(filename, "w", 'utf-8') as f:
        f.write(doc.toprettyxml(indent="  "))

 

转载于:https://www.cnblogs.com/Conker/p/6820345.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值