用Python3实现HDU爬虫(后续可能更新VJ),2016.11.4更新

主要问题:

1.爬到的信息有限

2.getstatus已经完成了;之前出错的原因是获取页面的代码被放到了循环外面,导致页面没有被重复获取(Orz)

3.欢迎大佬们测试留言

import re
import requests
import time
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

class Acauto(object):
    """Client that automates submissions to the HDU online judge.

    The workflow implemented here is: log in to acm.hdu.edu.cn, list the
    problems the account has already solved, search Baidu for blog posts
    containing a solution to each unsolved problem, submit the code found
    there, and poll the judge status page for the verdict.
    """

    # Browser-like User-Agent sent with every request.
    _USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')

    # Verdicts meaning the judge has not finished with a submission yet.
    _PENDING_VERDICTS = ('Queuing', 'Compiling', 'Running')

    def __init__(self):
        super().__init__()
        # Account name remembered by login(); used as the default user for
        # getstatus() and acrush().
        self.username = None
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self._USER_AGENT})

    def _resolve_username(self, username=None):
        """Return ``username`` or, if it is empty, the logged-in account.

        Raises:
            ValueError: if no username was given and login() was not called.
        """
        name = username or getattr(self, 'username', None)
        if not name:
            raise ValueError('no username given and login() has not been called')
        return name

    def login(self, username, password):
        """Sign in to HDU and remember ``username`` for later calls.

        The login cookies are kept in ``self.session``.
        """
        url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'

        data = {
            'username': username,
            'userpass': password,
            'login': 'Sign In',
        }
        headers = {
            'host': 'acm.hdu.edu.cn',
            'origin': 'http://acm.hdu.edu.cn',
            'referer': 'http://acm.hdu.edu.cn/'
        }
        self.session.post(url, data=data, headers=headers)
        self.username = username

    @staticmethod
    def _latest_verdict(soup, wanted):
        """Return the verdict text of the first status row for ``wanted``.

        ``soup`` is the parsed status page and ``wanted`` the problem id as a
        string. Returns None when the page does not have the expected table
        layout or no row for that problem is listed.
        """
        outer = soup.table
        if outer is None:
            return None
        tables = outer.find_all('table')
        if len(tables) < 2:
            return None
        for row in tables[-2].find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 4:
                # Not a submission row (too few columns); skip it.
                continue
            if cells[3].get_text(strip=True) == wanted:
                return cells[2].get_text(strip=True)
        return None

    def getstatus(self, problemID, username=None, max_polls=300):  # newly added feature
        """Poll the status page until the submission for ``problemID`` is judged.

        Args:
            problemID: problem number (int or str).
            username: account whose status page is polled; defaults to the
                account passed to login().
            max_polls: maximum number of polls (one per second plus request
                time); ``None`` polls without limit.

        Returns:
            The final verdict text (also printed), or None if no final
            verdict was seen within ``max_polls`` polls.

        Raises:
            ValueError: if no username is available.
        """
        username = self._resolve_username(username)
        status_url = 'http://acm.hdu.edu.cn/status.php'
        wanted = str(problemID)
        polls = 0
        while max_polls is None or polls < max_polls:
            polls += 1
            time.sleep(1)
            # The page must be fetched again on every iteration, otherwise
            # the verdict never changes.
            req = self.session.get(status_url, params={'user': username})
            soup = BeautifulSoup(req.text, 'lxml')
            verdict = self._latest_verdict(soup, wanted)
            if verdict and verdict not in self._PENDING_VERDICTS:
                print(verdict)
                return verdict
        print('no final verdict for problem', problemID, 'after', polls, 'polls')
        return None

    def submit(self, problemID, code, language=0):
        """Submit ``code`` for ``problemID`` and wait for the verdict.

        Args:
            problemID: problem number (int or str).
            code: source code text to submit.
            language: numeric language code sent in the form's ``language``
                field.

        Returns:
            Whatever getstatus() returns for this problem.
        """
        url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
        data = {
            'check': '0',
            'problemid': str(problemID),
            'language': str(language),
            'usercode': code
        }
        print('submitting problem: ', problemID)
        # requests form-encodes a dict passed as ``data`` and sets the
        # Content-Type header itself, so no explicit header is needed.
        self.session.post(url, data=data)
        return self.getstatus(problemID)

    @staticmethod
    def _parse_solved(solvedstr):
        """Extract problem numbers from the ';'-separated solved-list text.

        The first run of four digits in each fragment is taken as the
        problem number; fragments without one are ignored.
        """
        solved = []
        for item in solvedstr.split(';'):
            match = re.search(r'\d{4}', item)    # match four digits
            if match:
                solved.append(match.group(0))
        return solved

    def getsolved(self, username):
        """Return the list of problem numbers (strings) solved by ``username``.

        Returns an empty list when the user page lacks the expected
        ``<p align="left">`` element.
        """
        url = 'http://acm.hdu.edu.cn/userstatus.php'
        r = self.session.get(url, params={'user': username})
        # Locate the element whose text holds all solved problem numbers.
        soup = BeautifulSoup(r.text, 'html.parser')
        result = soup.find('p', align='left')
        if result is None:
            return []
        return self._parse_solved(result.text)

    def getbaidu(self, problemID):
        """Search Baidu for blog posts with a solution to ``problemID``.

        Returns:
            A list of source-code strings scraped from matching blog posts
            (possibly empty).
        """
        solutions = []
        wanted = str(problemID)
        url = r'http://www.baidu.com/s?wd=hdu%20' + wanted      # build the search URL from the problem number

        # A separate session keeps the HDU login cookies away from Baidu and
        # the blogs; the ``with`` block closes it when done.
        with requests.Session() as baidusession:
            baidusession.headers.update({'User-Agent': self._USER_AGENT})
            r = baidusession.get(url)

            soup = BeautifulSoup(r.text, 'html.parser')
            res = soup.find_all('a', attrs={'target': '_blank', 'class': 'c-showurl',
                                            'style': 'text-decoration:none;'})
            # NOTE(review): the host pattern below is kept exactly as found;
            # confirm it names the intended blog site.
            solutionurls = [
                item['href'] for item in res
                if item.has_attr('href') and re.match('blog.youkuaiyun.com', item.text)
            ]

            for link in solutionurls:
                page = BeautifulSoup(baidusession.get(link).text, 'html.parser')
                code = page.find(attrs={'name': 'code', 'class': 'cpp'})
                if code is None:
                    continue
                # Only accept the post if its title mentions the problem
                # number; otherwise move on to the next candidate instead of
                # abandoning the remaining ones.
                title = page.find('span', class_='link_title')
                if title is None or wanted not in title.text:
                    continue
                solutions.append(code.text)

        print(problemID, 'solutions finded: ', len(solutions))
        return solutions

    @staticmethod
    def _detect_language(answer):
        """Guess the language code to submit ``answer`` with.

        Returns 2 when the code mentions ``iostream`` or ``cstdio``, 0 when
        it mentions ``stdio.h``, and None when none of these appear.
        """
        if 'iostream' in answer or 'cstdio' in answer:
            return 2
        if 'stdio.h' in answer:
            return 0
        return None

    def acrush(self, start=1000, end=5932, interval=10, username=None):
        """Try to solve every unsolved problem in ``range(start, end)``.

        Args:
            start: first problem number to try.
            end: problem number to stop before (exclusive).
            interval: seconds to sleep after each submission.
            username: account whose solved list is checked; defaults to the
                account passed to login().

        Raises:
            ValueError: if no username is available.
        """
        username = self._resolve_username(username)
        # Fetch the solved list once and refresh it only after a submission,
        # instead of downloading it again for every problem number.
        solved = set(self.getsolved(username))
        for problemID in range(start, end):
            pid = str(problemID)
            if pid in solved:
                continue
            print(problemID, 'is not AC, start solving it...')
            for answer in self.getbaidu(problemID):
                language = self._detect_language(answer)
                if language is None:
                    print('language=???')
                    continue
                print('language=', language)
                self.submit(problemID, answer, language=language)
                time.sleep(interval)
                solved = set(self.getsolved(username))
                if pid in solved:
                    # Accepted; no need to try the remaining answers.
                    break

if __name__ == '__main__':
    # Script entry point: log in, then work through the problems starting
    # at startID.
    # Keep the names `c` and `user` at module level; other code in this
    # file may refer to them.
    c = Acauto()
    # NOTE(review): the account name and password are hard-coded in source;
    # consider loading them from the environment or a config file.
    user = 'zzuliauto2'
    password = '19951106'
    startID = 1010
    c.login(user, password)
    # Pass the named start value rather than repeating the literal.
    c.acrush(startID)



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Kelisita

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值