一、生成题库并处理图片地址
import requests
from DrissionPage._pages.chromium_page import ChromiumPage
# Module-level ChromiumPage instance; the scraping code below drives it
# through page.listen (packet capture) and page.get (navigation).
page = ChromiumPage()
import json
import re
import pandas as pd
from pathlib import Path

# Page that serves one set of exam questions per load.
EXAM_PAGE_URL = 'https://www.jsyks.com/kms-fzks'
# Number of page loads to collect; every load contributes one set of questions.
FETCH_ROUNDS = 1
# Host that serves the converted (webp) question images.
IMAGE_HOST = 'https://tkimg.mnks.cn/i/'
# Value of the 'tv' field when a question has no picture attached.
NO_IMAGE_TV = '/tkimg_files/source/'
# Source image suffixes that can be mapped to a webp URL.
IMAGE_SUFFIXES = ('.jpg', '.gif')
# Directory the downloaded images are written to.
IMG_DIR = Path('./img')
# Seconds to wait for an image download before giving up.
DOWNLOAD_TIMEOUT = 10
# Workbook the collected question bank is written to.
OUTPUT_XLSX = "科目四考试题库03.xlsx"

_EXAM_VAR = re.compile(r'var ExamCnts\s*=\s*')
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\s]+')


def extract_exam_items(body):
    """Return the JSON value assigned to ``var ExamCnts`` in *body*.

    The value is decoded with ``json.JSONDecoder.raw_decode`` so that a ``;``
    or a line break inside the JSON text cannot cut the payload short.

    Raises:
        ValueError: if *body* contains no ``var ExamCnts = ...`` assignment.
    """
    if isinstance(body, bytes):
        body = body.decode('utf-8')
    match = _EXAM_VAR.search(body)
    if match is None:
        raise ValueError("'var ExamCnts = ...' not found in response body")
    items, _ = json.JSONDecoder().raw_decode(body[match.end():])
    return items


def to_webp_url(tv):
    """Map a question's ``tv`` image path to its webp download URL.

    Only the trailing ``.jpg`` / ``.gif`` suffix is rewritten. Returns ``None``
    when the path does not end with a supported suffix.
    """
    file_name = tv.split('/')[-1]
    for suffix in IMAGE_SUFFIXES:
        if file_name.endswith(suffix):
            return IMAGE_HOST + file_name[:-len(suffix)] + '.webp/jsyks'
    return None


def local_image_path(tv):
    """Return the local path under ``IMG_DIR`` used to store the image of *tv*.

    The file is named after the source image rather than after the question
    text: many questions share their first characters, so text-based names
    make images overwrite each other. The ``.jpg`` extension is kept as in the
    original script.
    """
    stem = tv.split('/')[-1].rsplit('.', 1)[0]
    return IMG_DIR / f"{_UNSAFE_FILENAME_CHARS.sub('_', stem)}.jpg"


def download_image(url, destination):
    """Download *url* into *destination*; return True on success.

    A failed request is reported and skipped so that one broken image does not
    discard the questions collected so far.
    """
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'image download failed: {url} ({err})')
        return False
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'wb') as image_file:
        image_file.write(response.content)
    return True


def build_record(item):
    """Turn one raw question dict (keys ``tm``, ``da``, ``tv``) into a row.

    The row always has ``tm`` and ``dalist``. ``newurl`` is '无' for questions
    without a picture, or the webp URL otherwise. ``filepath`` is added only
    when the picture was actually saved.
    """
    question = item['tm']
    record = {'tm': question, 'dalist': item['da']}
    tv = item['tv']
    print(question[:10])  # progress output
    if tv == NO_IMAGE_TV:
        record['newurl'] = '无'
        return record
    newurl = to_webp_url(tv)
    if newurl is None:
        return record
    record['newurl'] = newurl
    destination = local_image_path(tv)
    if download_image(newurl, destination):
        # Same 'file:///<absolute path>' layout as before, but derived from
        # the file that was really written instead of a hard-coded directory.
        record['filepath'] = f'file:///{destination.resolve()}'
    return record


def main():
    """Collect the exam questions and save them to ``OUTPUT_XLSX``."""
    records = []
    for _ in range(FETCH_ROUNDS):
        # Start listening before navigating so the question packet is captured.
        page.listen.start('kaoshiti/')
        page.get(EXAM_PAGE_URL)
        packet = page.listen.wait()
        for item in extract_exam_items(packet.response.body):
            records.append(build_record(item))
    # The index column is written on purpose: the de-duplication step that
    # follows compares every column except the first one.
    pd.DataFrame(records).to_excel(OUTPUT_XLSX)


if __name__ == "__main__":
    main()
二、数据处理:相同的题目只保留第一个
import pandas as pd
# Load the question bank written by the scraping step.
df = pd.read_excel("科目四考试题库03.xlsx")
# Rows are compared on every column except the first one; the scraping step
# saves the DataFrame index there, so it would make every row look unique.
# Only the first occurrence of each duplicated row is kept.
df = df.drop_duplicates(subset=df.columns[1:], keep='first')
# Save the de-duplicated bank without writing another index column.
df.to_excel('科目四考试题库04.xlsx', index=False)
1万+

被折叠的 条评论
为什么被折叠?



