我在网上搜到了一篇博客,其中的代码原本是 Python 2 的,我把它改成了 Python 3,但运行时出现了错误。修改后的代码如下:
import re
import urllib
import urllib.request
import urllib.parse
#urllib:
#urllib2: The urllib2 module defines functions and classes which help in opening
#URLs (mostly HTTP) in a complex world — basic and digest authentication,
#redirections, cookies and more.
def translate(text):
    """Request the Google Translate page and scrape the translated text.

    A browser-like ``User-Agent`` header is sent to
    ``http://translate.google.cn``; the UTF-8 decoded response is then
    searched for the text that follows ``TRANSLATED_TEXT=`` up to the next
    ``;``.

    Args:
        text: The English sentence to translate.

    Returns:
        The matched text with leading/trailing ``;`` characters stripped.

    Raises:
        AttributeError: If the response contains no ``TRANSLATED_TEXT=``
            marker (the regex search then yields ``None``).
    """
    # 'langpair': "'en'|'zh-CN'" means English -> Simplified Chinese.
    form_fields = {
        'hl': 'zh-CN',
        'ie': 'UTF-8',
        'text': text,
        'langpair': "'en'|'zh-CN'",
    }
    chaper_url = 'http://translate.google.cn'
    # NOTE: the encoded form body is built here, but it is not handed to
    # the Request object created below.
    data = urllib.parse.urlencode(form_fields).encode(encoding='UTF8')

    # Pretend to be a browser when contacting Google Translate.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=chaper_url, headers=headers)
    page = urllib.request.urlopen(req).read().decode('utf-8')

    # Filter the translated text out of the returned page.
    #   (?<=...)  look-behind: match only right after 'TRANSLATED_TEXT='
    #   .*?       non-greedy: stop at the first ';'
    m = re.search(r"(?<=TRANSLATED_TEXT=).*?;", page)
    return m.group().strip(';')
if __name__ == "__main__":
    # Source text to translate (it could also be read from a text file).
    text_1 = 'Hello, my name is Derek. Nice to meet you! '
    print('The input text: %s' % text_1)

    # Strip any single quotes surrounding the scraped value.
    text_2 = translate(text_1).strip("'")
    print('The output text: %s' % text_2)

    # Save the result. The context manager closes the file even if the
    # write fails; the explicit encoding keeps the Chinese output from
    # depending on the platform's default codec.
    filename = 'c:\\Translation.txt'
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(text_2)

    # These were two statements fused onto one line (a syntax error).
    report = 'Master, I have done the work and saved the translation at ' + filename + '.'
    print('Report: %s' % report)
运行时没有报错,但就是无法翻译。排查后找到了原因:原来是 data 没有传给 Request。
于是进行修改,将
req = urllib.request.Request(url=chaper_url, headers=headers)
改为:
req = urllib.request.Request(url=chaper_url,data=data, headers=headers)
本以为这样应该没有错误了,可是运行时出现了异常。
试了很多方法仍然没有效果,还是出现同样的错误。虽然没有跑通,可是从中学到了不少,我还会继续调试这个代码的。
python2与 python3语法区别
参考:http://blog.youkuaiyun.com/samxx8/article/details/21535901
源程序:
1. url='http://translate.google.cn/translate_t'
2. data = urllib.urlencode(values)
3. req = urllib2.Request(url,data)
4. #模拟一个浏览器
5. browser='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
6. req.add_header('User-Agent',browser)
7. #向谷歌翻译发送请求
8. response = urllib2.urlopen(req)
9. #读取返回页面
10. html=response.read()
看了一篇博客:http://www.2cto.com/kf/201309/242273.html 《Python3.x中"HTTP Error 403:Forbidden"问题的解决方案》
出现 403 主要是由于该网站禁止爬虫访问导致的。可以在请求中加上头信息 User-Agent,伪装成浏览器访问;具体的取值可以通过火狐的 FireBug 插件查询:
# Build the request with a browser-like User-Agent header, then open it
# and read the response body.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=chaper_url, headers=headers)
urllib.request.urlopen(req).read()
将urllib.request.urlopen.read()替换成上面的代码后,出现问题的页面就可以正常访问了。
改为:
# Revised request setup: the form body and the add_header() call are
# commented out; the User-Agent is passed through headers= instead.
values={'hl':'zh-CN','ie':'UTF-8','text':text_1,'langpair':"'en'|'zh-CN'"}
chaper_url='http://translate.google.cn'
#data = urllib.parse.urlencode(values).encode(encoding='UTF8')
#req = urllib.request.Request(url,data)
# Pretend to be a browser
#browser='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
#req.add_header('User-Agent',browser)
# Send the request to Google Translate
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=chaper_url, headers=headers)
# response already holds the body returned by read().
response=urllib.request.urlopen(req).read()
#response = urllib.request.urlopen(req)
# Read the returned page
#html=response
原来程序:
response=urllib.request.urlopen(req).read()
html=response.read()
改正(response 已经是 read() 返回的 bytes 对象,不能再对它调用 read()):
html=response
然后我又从网上找到了一个Google翻译代码 http://www.jb51.net/article/46093.htm
import urllib.request
import sys
# Local filesystem encoding; used only as a fallback when the server does
# not declare a charset for the response.
typ = sys.getfilesystemencoding()


def translate(querystr, to_l="zh", from_l="en"):
    """Translate *querystr* by scraping Google's mobile translate page.

    Args:
        querystr: Text to translate.
        to_l: Language code sent as the ``hl`` query parameter.
        from_l: Language code sent as the ``sl`` query parameter.

    Returns:
        The text that follows the first ``class="t0">`` marker in the
        returned page, up to the next ``<``. An empty string is returned
        when the marker is not present in the page.
    """
    import urllib.parse  # local import keeps this snippet self-contained

    C_agent = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.165063 Safari/537.36 AppEngine-Google."}
    flag = 'class="t0">'

    # urlencode() percent-escapes the query (spaces become "+"), so text
    # containing "&", "#" or non-ASCII characters no longer corrupts the
    # URL. Building the URL this way also drops the trailing whitespace the
    # old backslash-continued literal left behind, which http.client rejects.
    query = urllib.parse.urlencode({"hl": to_l, "sl": from_l, "q": querystr})
    tarurl = "http://translate.google.com/m?" + query

    request = urllib.request.Request(tarurl, headers=C_agent)
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(request) as response:
        # Prefer the charset the server declares; the filesystem encoding
        # is unrelated to the response and is kept only as a fallback.
        charset = response.headers.get_content_charset() or typ
        page = response.read().decode(charset)

    start = page.find(flag)
    if start == -1:
        # Without this guard, find()'s -1 would make the slice below
        # return arbitrary page content instead of a translation.
        return ""
    target = page[start + len(flag):]
    return target.split("<")[0]
#print(translate("After numerous media reports, Nike Business (China) Co., Ltd. finally issued a fourth statement to consumers yesterday:"))
这个是将英文翻译成中文的。