# encoding:utf-8
import sys
import re
import pymssql
from urllib2 import Request, urlopen, URLError, HTTPError
def get_packet(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        Whatever ``read()`` on the response object yields.

    Errors raised by ``urlopen`` or ``read()`` propagate to the caller.
    """
    packet = urlopen(url)
    try:
        return packet.read()
    finally:
        # Release the underlying connection even when read() fails; the
        # response was previously never closed.
        packet.close()
def get_data(packet):
    """Parse the page text and store the extracted strings in module globals.

    Sets two module-level lists:

    * ``items``  -- the text of every ``<a href...>`` link that is directly
      followed by ``</th>``.
    * ``items2`` -- the contents of every ``<td width="135">`` cell, skipping
      any cell whose text contains ``'&'``.

    Args:
        packet: Page source as a string.

    Returns:
        None; results are published through the ``items`` / ``items2`` globals.
    """
    global items
    global items2
    # re.findall always returns a list (empty when nothing matches), so the
    # results can be assigned without a None guard.
    items = re.findall(r'<a href.*>(.*)</a></th>', packet)
    cells = re.findall(r'<td width="135">(.*)</td>', packet)
    # Cells containing '&' are dropped
    # (presumably HTML entities such as &nbsp; in empty cells -- TODO confirm).
    items2 = [cell for cell in cells if '&' not in cell]
def save_data():
    """Insert the scraped values into the ``assets_and_liabilities`` table.

    Reads the module globals ``ID``, ``items`` and ``items2``. For each index
    ``i`` of ``items`` one row is inserted with the values
    ``(ID, items2[0], items[i], items2[2 * i + 2])``. All rows are committed
    together after the loop.

    Raises:
        IndexError: if ``items2`` has fewer entries than the indexing needs.
        Errors raised by ``pymssql`` propagate to the caller; the connection
        is closed in every case.
    """
    # NOTE(review): server name and credentials are hard-coded here; consider
    # moving them to configuration.
    # Raw string: "\S" is not a valid escape sequence; the value is unchanged.
    server = r"A853F747DDE4458\SQLEXPRESS"
    user = "sa"
    password = "123456"
    sql = "INSERT INTO assets_and_liabilities VALUES (%d, %s, %s, %s)"

    conn = pymssql.connect(server, user, password, "stock")
    try:
        cursor = conn.cursor()
        for i, name in enumerate(items):
            # Equivalent to the former i + j with j = 2, 3, 4, ...: the value
            # index advances by two per row, starting at 2.
            cursor.execute(sql, (ID, items2[0], name, items2[2 * i + 2]))
        # commit() must be called to persist the data unless autocommit is
        # set to True.
        conn.commit()
    finally:
        # Always release the connection, including when an insert fails.
        conn.close()
if __name__ == '__main__':
    # Script entry point: download one balance-sheet page, parse it and store
    # the values. ID, items and items2 are module-level names that
    # get_data() / save_data() read and write as globals.
    ID = 600787
    items = []
    items2 = []

    # Build the URL from ID so the stock code is defined in one place only.
    url = 'http://stock.finance.qq.com/corp1/cbsheet.php?zqdm=%d&type=2014' % ID
    packet = get_packet(url)
    # NOTE(review): '~' looks like a legacy "no data" sentinel; the get_packet
    # in this file returns the response body, so this only triggers when the
    # body is exactly '~'. Kept for compatibility.
    if packet == '~':
        sys.exit(0)
    get_data(packet)
    save_data()
此次代码更新了以下内容:
1、对抓取到的数据进行处理,用语句
if tmp2 is not None :
items2 = [x for x in tmp2 if '&' not in x]
对含有“&”的数据（例如“&nbsp;”）作丢弃处理。新生成一个列表赋值给items2。
2、增加写入数据库的语句,把数据批量插入数据库表中,通过导入pymssql库,即可用python操作mssql数据库。pymssql下载地址
3、修正items和items2为全局变量,在函数中加global items即可,否则在get_data函数中给items和items2赋值后,运行到save_data函数时,items和items2仍然是初始值。get_data的赋值会不起作用。(此问题困扰了好久,真失败。)