import re
import time
import requests
from dataclasses import dataclass
import linecache
import os
def get_filter(text):
    """Strip layout characters and page boilerplate from scraped text.

    Args:
        text: A string, a list of fragments (joined with no separator), or
            any other object (converted with ``str``).

    Returns:
        The text with surrounding whitespace stripped and every token in the
        filter list removed. Plain spaces are among the removed tokens, so
        words separated by spaces are run together.
    """
    if isinstance(text, list):
        # Coerce each fragment so a list holding non-string items does not
        # make ``str.join`` raise TypeError.
        text = ''.join(map(str, text))
    text = str(text).strip()
    # Tokens are removed one after another in this order.
    filter_list = (
        '\r', '\n', '\t', '\u3000', '\xa0', '\u2002',
        '<br>', '<br/>', ' ', '>>', '"',
        '展开全部',
    )
    for token in filter_list:
        text = text.replace(token, '')
    return text
def get_qtv_qtk():
    """Fetch the session tokens sent along with fanyi.qq.com translate calls.

    Requests the landing page, reads the ``fy_guid`` cookie from the response
    and extracts the ``qtv`` and ``qtk`` values from ``var qtv = "..."`` /
    ``var qtk = "..."`` assignments in the page source.

    Returns:
        A ``(fy_guid, qtv, qtk)`` tuple. ``fy_guid`` is ``None`` when the
        response did not set that cookie.

    Raises:
        ValueError: If the page contains no ``qtv`` or ``qtk`` assignment.
        requests.exceptions.RequestException: On network failure, including
            a timeout.
    """
    api_url = 'https://fanyi.qq.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    res = requests.get(api_url, headers=headers, timeout=10)
    data = res.text
    fy_guid = res.cookies.get('fy_guid')
    tokens = {}
    for name in ('qtv', 'qtk'):
        match = re.search(r'var %s = "(.*?)"' % name, data)
        if match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'could not find "var %s" in the response from %s'
                % (name, api_url))
        tokens[name] = match.group(1)
    return fy_guid, tokens['qtv'], tokens['qtk']
@dataclass
class SougoTrans(object):
    """Client for the fanyi.qq.com web translation endpoint.

    Constructing an instance performs a network request (through
    ``get_qtv_qtk``) to obtain fresh session tokens.

    Fields:
        fromlang: Source language code; a falsy value becomes ``'auto'``.
        tolang: Target language code; a falsy value becomes ``'en'``.
        text: The text to translate.
    """
    fromlang: str
    tolang: str
    text: str

    def __post_init__(self):
        """Apply language defaults, fetch session tokens and build headers."""
        self.api_url = 'https://fanyi.qq.com/api/translate'
        if not self.fromlang:
            self.fromlang = 'auto'
        if not self.tolang:
            self.tolang = 'en'  # default to English
        # Millisecond timestamp used as the session identifier.
        self.sessionUuid = str(int(time.time() * 1000))
        self.fy_guid, self.qtv, self.qtk = get_qtv_qtk()
        # Build the cookie from the freshly fetched values; a value that is
        # missing (None) is left out rather than raising TypeError.
        cookie_values = (
            ('fy_guid', self.fy_guid),
            ('qtv', self.qtv),
            ('qtk', self.qtk),
        )
        cookie = ''.join(
            '%s=%s; ' % (key, value)
            for key, value in cookie_values
            if value is not None
        )
        self.headers = {
            'Cookie': cookie,
            'Host': 'fanyi.qq.com',
            'Origin': 'https://fanyi.qq.com',
            'Referer': 'https://fanyi.qq.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }

    def get_trans_result(self):
        """Post ``self.text`` to the translate endpoint and return the result.

        Returns:
            The concatenated ``targetText`` of every record in the response,
            or ``''`` when the response is not JSON or lacks the expected
            ``translate`` / ``records`` / ``targetText`` structure. In that
            case the raw response body is printed.

        Raises:
            requests.exceptions.RequestException: On network failure,
                including a timeout.
        """
        payload = {
            'source': self.fromlang,
            'target': self.tolang,
            'sourceText': self.text,
            'qtv': self.qtv,
            'qtk': self.qtk,
            'sessionUuid': self.sessionUuid,
        }
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.post(
            self.api_url, data=payload, headers=self.headers, timeout=10)
        try:
            records = response.json()['translate']['records']
            return ''.join(record['targetText'] for record in records)
        except (ValueError, KeyError, TypeError):
            # ValueError: body is not JSON; KeyError/TypeError: unexpected
            # shape. Other exceptions (e.g. KeyboardInterrupt) propagate.
            print(response.text)
            return ''
def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name`` as UTF-8, replacing any old file.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when write() raises.
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
def run(filepath_name: object) -> object:
    """Translate every file in a directory to English, one output per input.

    The i-th entry returned by ``os.listdir`` (counting from 1) is read,
    translated with ``SougoTrans`` and written to ``./英文数据/<i>.txt``.
    The elapsed time per file is printed and the loop sleeps one second
    between files.

    Args:
        filepath_name: Path of the directory holding the source files.

    Returns:
        None.
    """
    fromlang = ''
    tolang = 'en'
    for i, filename in enumerate(os.listdir(filepath_name), start=1):
        fileout_name = './英文数据/' + str(i) + '.txt'
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement for measuring elapsed time.
        start = time.perf_counter()
        filename = os.path.join(filepath_name, filename)
        # Build the text afresh for each file so earlier files' contents do
        # not leak into later translations.
        text = ''.join(linecache.getlines(filename))
        Sougou = SougoTrans(fromlang, tolang, text)
        res = Sougou.get_trans_result()
        save_to_file(fileout_name, res)
        end = time.perf_counter()
        print('Running time: %s Seconds' % (end - start))
        time.sleep(1)
if __name__ == '__main__':
    # Directory mode alternative: run("./原始数据")
    fromlang = ''
    tolang = 'en'
    # Input files are named by number. Ids 9116 through 14928 inclusive are
    # processed: the previous loop started its counter at 9115, incremented
    # it before use, and kept going while the counter was <= 14927.
    first_id, last_id = 9116, 14928
    for a in range(first_id, last_id + 1):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement for measuring elapsed time.
        start = time.perf_counter()
        # linecache.getlines returns [] for an unreadable file, so a missing
        # input yields empty text rather than an exception.
        text = ''.join(linecache.getlines('./原始数据/' + str(a) + '.csv'))
        # Optional clean-up step (disabled): text = get_filter(text)
        Sougou = SougoTrans(fromlang, tolang, text)
        res = Sougou.get_trans_result()
        fileout_name = './英文数据/' + str(a) + '.txt'
        save_to_file(fileout_name, res)
        end = time.perf_counter()
        print('Running time: %s Seconds' % (end - start))
        time.sleep(1)
# 爬虫破解腾讯网页翻译翻译文档内容
# 最新推荐文章于 2025-06-28 21:21:36 发布