爬虫成语词语源码优化IndexError: list index out of range_random.choice list index out of range-CSDN博客

本文链接：https://blog.csdn.net/zfs1570/article/details/110810246

本文介绍了一个使用Python爬虫抓取特定网站上的成语及其解释的过程，并将数据保存到Excel文件中。针对每个成语，程序还实现了随机抽取一个字进行组词的功能，通过网络请求获取相关词汇。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Traceback (most recent call last):
File "D:\eclipse\works\pytest\pytest\test\cytest.py", line 60, in <module>
jieinfo=str(jieinfo[0])
IndexError: list index out of range

这个异常表示访问list时使用的下标超出了列表的有效范围。

主要优化内容：组词结果不足5个时直接跳过（pass）。因为选项需要5个组词，正则必须匹配到至少五个结果；如果不足五个，用下标4去取第五个结果时程序必然报错，无法继续执行下去。

上面的修改也同时解决了需要手动添加过滤某些成语的情况。

'''
Created on 2020年12月1日

@author: foshion888
'''
import random
import re
import time
import urllib.request

import xlwt.Workbook

def chengyu(url):
    """Download *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the server answers with an
        HTTP error status (the error is printed, not raised).
    """
    try:
        # The context manager closes the connection even when read() or
        # decode() fails, so sockets do not leak across hundreds of requests.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('gbk')
    # Exception handling: report the HTTP failure and let the caller carry on.
    except urllib.error.HTTPError as reason:
        print(reason)
        return None

# Endpoints used by the crawler. The hosts are placeholders in this script;
# the detail URL originally read "xxxcom" (missing dot), which did not agree
# with the list URL's host.
LIST_URL = "http://xxx.com/list/%s_%s.html"
DETAIL_URL = "http://xxx.com/cy%s/%s.html"
ZUCI_URL = "http://www.xxx.com/zuci-%s/"

# One list entry: (page id, idiom id, idiom text).
IDIOM_LINK_RE = re.compile(r'<li><a href="/cy(\d+)/(\d+).html">(.+?)</a></li>')
# The explanation cell that follows the "释义" label cell.
EXPLANATION_RE = re.compile(r'释义</td>\s+<td>(.+)</td>')
# A single CJK character, captured as a candidate option.
HANZI = r'([\u4e00-\u9fa5])'

SHEET_HEADERS = (
    "idiom_name",
    "idiom_word1",
    "idiom_word2",
    "idiom_word3",
    "idiom_word4",
    "idiom_lack",
    "idiom_explan",
    "idiom_option",
)


def extract_explanation(page):
    """Return the idiom explanation found in *page*, or None if absent.

    The text is taken directly from the regex group instead of slicing the
    repr of a findall tuple, so it is correct whatever whitespace separates
    the cells and whatever characters the explanation contains.
    """
    found = EXPLANATION_RE.search(page)
    return found.group(1) if found else None


def zuci_code(char):
    """Return the code used in the word-lookup URL for *char*.

    This is the unicode-escape form of the character with its two-character
    prefix removed (for example '4e2d' for '中').
    """
    return char.encode('unicode-escape').decode()[2:]


def pick_options(page, partner, candidate_first, count=5):
    """Collect *count* characters that form a two-character word with *partner*.

    Args:
        page: Text of the word-lookup page.
        partner: The idiom character the candidates must pair with.
        candidate_first: True to match "<candidate><partner>", False to match
            "<partner><candidate>".
        count: How many candidates are required.

    Returns:
        The first *count* candidates joined into one string, or None when the
        page holds fewer than *count* matches (indexing past the end of the
        match list is what used to raise IndexError).
    """
    anchor = re.escape(partner)
    word = HANZI + anchor if candidate_first else anchor + HANZI
    found = re.findall(r'target="_blank">' + word + '</a>', page)
    if len(found) < count:
        return None
    return "".join(found[:count])


def fetch_zuci_page(char):
    """Fetch the word-lookup page for *char* as UTF-8 text.

    Returns an empty string (after printing the error) on an HTTP error, in
    line with how chengyu() reports failures.
    """
    try:
        with urllib.request.urlopen(ZUCI_URL % zuci_code(char)) as response:
            return response.read().decode('utf-8', 'ignore')
    except urllib.error.HTTPError as reason:
        print(reason)
        return ""


def write_idiom_row(worksheet, row, cypage, cynum, cyname):
    """Fill one worksheet row for the four-character idiom *cyname*.

    Writes the idiom, its explanation (when the detail page has one), its
    four characters, one randomly chosen "missing" character and, when at
    least five are found, five candidate characters for the quiz options.
    """
    print("成语:", cyname)
    worksheet.write(row, 0, cyname)

    detail = chengyu(DETAIL_URL % (cypage, cynum))
    explanation = extract_explanation(detail or "")
    # Leave the cell empty instead of crashing when the page has no
    # explanation (or could not be downloaded).
    if explanation is not None:
        worksheet.write(row, 6, explanation)

    for col, char in enumerate(cyname, start=1):
        worksheet.write(row, col, char)

    missing = random.randrange(4)
    time.sleep(1)  # pause between the detail request and the word lookup
    worksheet.write(row, 5, cyname[missing])

    # Characters are paired (0,1) and (2,3): the lookup uses the neighbour of
    # the missing character, and candidates take the missing character's side.
    partner_index = missing ^ 1
    partner = cyname[partner_index]
    options = pick_options(
        fetch_zuci_page(partner),
        partner,
        candidate_first=partner_index % 2 == 1,
    )
    if options is not None:
        worksheet.write(row, 7, options)


if __name__ == '__main__':
    zimu = list("DEFGHIJKLMNOPQRSTUVWXYZ")
    # Iterate over the index letters.
    for zm in zimu:
        rundetail_excel = xlwt.Workbook()
        # Iterate over the numbered list pages of this letter.
        for num in range(3, 19):
            sheetname = zm + str(num)
            print(sheetname)
            res = chengyu(LIST_URL % (zm, num))
            cyall = IDIOM_LINK_RE.findall(res or "")
            worksheet = rundetail_excel.add_sheet(sheetname, cell_overwrite_ok=True)
            for col, title in enumerate(SHEET_HEADERS):
                worksheet.write(0, col, title)
            # Iterate over every idiom on the page; the row number follows
            # the list position, so skipped idioms leave their row blank.
            for cylen, (cypage, cynum, cyname) in enumerate(cyall):
                print("编号", cylen)
                if len(cyname) != 4:
                    continue
                time.sleep(1)  # throttle only when a request will be made
                write_idiom_row(worksheet, cylen + 1, cypage, cynum, cyname)
        # NOTE(review): no Workbook.save() call is visible in this excerpt.
        # If it is not done further down, the workbook for this letter has to
        # be saved here, after all of its sheets are filled -- confirm.