主要问题:
1.爬到的信息有限
2.getstatus已经修好了:之前出错的原因是获取状态页面的请求被放到了循环外面,导致页面没有被重复获取(Orz)
3.欢迎大佬们测试留言
import re
import requests
import time
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
class Acauto(object):
    """Automated submitter for the HDU online judge (acm.hdu.edu.cn).

    The workflow implemented here is: log in, list the problems the account
    has already solved, search Baidu for blog posts containing a solution to
    each unsolved problem, submit the code found there and poll the status
    page for the verdict.

    All judge traffic goes through one ``requests.Session`` so the login
    cookie is reused by later requests.
    """

    # Seconds to wait for any single HTTP request; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

    # Verdicts that mean the judge has not finished with the submission yet.
    PENDING_VERDICTS = ('Queuing', 'Compiling', 'Running')

    # Host shown by Baidu for the blog results that are worth opening.
    # NOTE(review): the host name is kept exactly as the original script had
    # it (only the dots are escaped) - confirm it is the intended blog site.
    BLOG_HOST_RE = re.compile(r'blog\.youkuaiyun\.com')

    def __init__(self):
        super().__init__()
        # Account name remembered by login(); used by getstatus()/acrush().
        self.username = None
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.USER_AGENT})

    def login(self, username, password):
        """Post the login form and remember *username* for later calls.

        The response is not inspected, so a wrong password is not detected
        here.
        """
        url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
        data = {
            'username': username,
            'userpass': password,
            'login': 'Sign In',
        }
        headers = {
            'host': 'acm.hdu.edu.cn',
            'origin': 'http://acm.hdu.edu.cn',
            'referer': 'http://acm.hdu.edu.cn/',
        }
        self.session.post(url, data=data, headers=headers,
                          timeout=self.REQUEST_TIMEOUT)
        self.username = username

    @staticmethod
    def _latest_verdict(html, problem_id):
        """Return the verdict text of the first status row for *problem_id*.

        Returns ``None`` when the page has no matching row or does not have
        the expected table layout.
        """
        soup = BeautifulSoup(html, 'lxml')
        try:
            rows = soup.table.find_all('table')[-2].find_all('tr')
        except (AttributeError, IndexError):
            # No outer table, or fewer nested tables than expected.
            return None
        for row in rows:
            cells = row.find_all('td')
            if len(cells) < 4:
                # Rows without a problem-id column cannot match.
                continue
            # get_text() instead of .string: .string is None as soon as a
            # cell contains more than one child node.
            if cells[3].get_text(strip=True) == problem_id:
                return cells[2].get_text(strip=True)
        return None

    def getstatus(self, problemID, username=None, max_wait=300):
        """Poll the status page until *problemID* has a final verdict.

        The page is fetched once per second. The first row whose problem id
        matches is used, so an older submission to the same problem can be
        reported if the new one is not listed yet.

        Args:
            problemID: problem number to look for.
            username: account whose status page is read; defaults to the
                name given to login().
            max_wait: give up after this many seconds; ``None`` polls
                forever.

        Returns:
            The verdict text (also printed), or ``None`` when *max_wait*
            elapsed first.

        Raises:
            ValueError: no username was given and login() was never called.
        """
        user = username or self.username
        if not user:
            raise ValueError(
                'getstatus needs a username: call login() first or pass username=')
        wanted = str(problemID)
        deadline = None if max_wait is None else time.monotonic() + max_wait
        while deadline is None or time.monotonic() < deadline:
            time.sleep(1)
            # The page must be re-fetched on every round to see new results.
            page = self.session.get('http://acm.hdu.edu.cn/status.php',
                                    params={'user': user},
                                    timeout=self.REQUEST_TIMEOUT)
            verdict = self._latest_verdict(page.text, wanted)
            if verdict is not None and verdict not in self.PENDING_VERDICTS:
                print(verdict)
                return verdict
        print('no final verdict for problem', problemID, 'after', max_wait, 'seconds')
        return None

    def submit(self, problemID, code, language=0):
        """Submit *code* for *problemID* and wait for the verdict.

        Args:
            problemID: problem number.
            code: source code text.
            language: numeric language code of the judge's submit form.

        Returns:
            Whatever getstatus() returns for this problem.
        """
        url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
        data = {
            'check': '0',
            'problemid': str(problemID),
            'language': str(language),
            'usercode': code,
        }
        print('submitting problem: ', problemID)
        # No explicit content-type header: requests form-encodes a dict
        # passed as ``data`` and sets the header itself.
        self.session.post(url, data=data, timeout=self.REQUEST_TIMEOUT)
        return self.getstatus(problemID)

    @staticmethod
    def _parse_solved(solvedstr):
        """Extract the problem numbers from the ';'-separated solved string.

        Each chunk contributes its first run of four digits; chunks without
        one (empty or whitespace-only) are skipped.
        """
        solved = []
        for item in solvedstr.split(';'):
            found = re.search(r'\d{4}', item)
            if found:
                solved.append(found.group(0))
        return solved

    def getsolved(self, username):
        """Return the list of problem numbers (as strings) solved by *username*.

        Returns an empty list when the user page has no solved-problems
        paragraph.

        Raises:
            requests.HTTPError: the user page answered with an error status.
        """
        r = self.session.get('http://acm.hdu.edu.cn/userstatus.php',
                             params={'user': username},
                             timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of reporting "nothing solved".
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        # The paragraph whose text holds all solved problem numbers.
        result = soup.find('p', align='left')
        if result is None:
            return []
        return self._parse_solved(result.text)

    def getbaidu(self, problemID):
        """Search Baidu for "hdu <problemID>" and collect candidate solutions.

        Only results from the blog host in BLOG_HOST_RE are opened. A blog
        page contributes its code block when it has one and its title
        contains the problem number; pages that cannot be fetched or do not
        qualify are skipped.

        Returns:
            A list of source-code strings, possibly empty.
        """
        solutions = []
        # Build the search URL from the problem number.
        url = r'http://www.baidu.com/s?wd=hdu%20' + str(problemID)
        with requests.Session() as baidusession:
            baidusession.headers.update({'User-Agent': self.USER_AGENT})
            r = baidusession.get(url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(r.text, 'html.parser')
            res = soup.find_all('a', attrs={'target': '_blank', 'class': 'c-showurl',
                                            'style': 'text-decoration:none;'})
            solutionurls = [item.get('href') for item in res
                            if item.get('href') and self.BLOG_HOST_RE.match(item.text)]
            for link in solutionurls:
                try:
                    r = baidusession.get(link, timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException:
                    # One unreachable blog must not abort the whole search.
                    continue
                soup = BeautifulSoup(r.text, 'html.parser')
                code = soup.find(attrs={'name': 'code', 'class': 'cpp'})
                if code is None:
                    continue
                # Check the blog title first: it must contain the problem
                # number. find() returns None when the page has no such span.
                title = soup.find('span', class_='link_title')
                if title is None or str(problemID) not in title.text:
                    continue
                solutions.append(code.text)
        print(problemID, 'solutions finded: ', len(solutions))
        return solutions

    @staticmethod
    def _guess_language(answer):
        """Guess the judge language code from the headers a solution includes.

        Returns 2 for code mentioning ``iostream`` or ``cstdio``, 0 for code
        mentioning ``stdio.h`` and ``None`` when none of them appears.
        NOTE(review): 2 and 0 are presumably the judge's C++ and C/G++
        codes - confirm against the submit form.
        """
        if 'iostream' in answer or 'cstdio' in answer:
            return 2
        if 'stdio.h' in answer:
            return 0
        return None

    def acrush(self, start=1000, end=5932, interval=10):
        """Try to solve every unsolved problem in ``range(start, end)``.

        For each unsolved problem the solutions found by getbaidu() are
        submitted one after another until the problem shows up in the
        account's solved list.

        Args:
            start: first problem number.
            end: problem number to stop before (exclusive).
            interval: seconds to sleep after each submission.

        Raises:
            ValueError: login() was never called, so the account is unknown.
        """
        user = self.username
        if not user:
            raise ValueError('acrush needs a logged-in account: call login() first')
        # Fetched once here and refreshed only after a submission, instead
        # of downloading the user page again for every problem.
        solved = set(self.getsolved(user))
        for problemID in range(start, end):
            pid = str(problemID)
            if pid in solved:
                continue
            print(problemID, 'is not AC, start solving it...')
            for answer in self.getbaidu(problemID):
                language = self._guess_language(answer)
                if language is None:
                    print('language=???')
                    continue
                print('language=', language)
                self.submit(problemID, answer, language=language)
                time.sleep(interval)
                solved = set(self.getsolved(user))
                if pid in solved:
                    break
if __name__ == '__main__':
    # Script entry point: log in and work through the problems from startID.
    # The names ``c`` and ``user`` stay at module level because methods of
    # Acauto may look them up as globals.
    import os

    c = Acauto()
    # NOTE(review): credentials should not live in source control. They can
    # be overridden through HDU_USER / HDU_PASSWORD; the literals are kept
    # only as defaults so the script behaves as before when those are unset.
    user = os.environ.get('HDU_USER', 'zzuliauto2')
    password = os.environ.get('HDU_PASSWORD', '19951106')
    startID = 1010
    c.login(user, password)
    # Use startID instead of repeating the literal.
    c.acrush(startID)