(2)抓取系统
common_urllib.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import urllib
import urllib2
import traceback
import json
import logging
import types
import re
import common_logging
logger = logging.getLogger()
def get(url):
    """Fetch ``url`` with ``urllib.urlopen`` and return the body as unicode.

    The body is stripped of surrounding whitespace and decoded as UTF-8,
    silently dropping bytes that cannot be decoded.  This is a best-effort
    helper: a non-200 status code or an exception raised while fetching is
    logged through the module logger and an empty unicode string is returned.

    :param url: address to fetch.
    :return: the decoded response body, or ``u''`` on any failure.
    """
    content = u''
    try:
        response = urllib.urlopen(url)
        try:
            if response.getcode() == 200:
                raw_body = response.read()
                content = unicode(raw_body.strip(), 'utf-8', 'ignore')
            else:
                logger.error('fetch error [%s]' % url)
        finally:
            # Always release the connection, even when reading fails.
            response.close()
    except Exception:
        # Deliberately best-effort: log the traceback and return what we
        # have.  Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        logger.error('fetch error %s' % traceback.format_exc())
    return content
def post(url, dict_data=None):
    """Send ``dict_data`` form-encoded to ``url`` and return the body as unicode.

    The fields are encoded with ``urllib.urlencode`` and passed as the request
    data to ``urllib2``.  The response body is stripped of surrounding
    whitespace and decoded as UTF-8, silently dropping undecodable bytes.
    A non-200 status code or an exception raised while fetching is logged
    through the module logger and an empty unicode string is returned.

    :param url: address to send the request to.
    :param dict_data: mapping (or sequence of pairs) of form fields; when
        omitted an empty form body is sent.
    :return: the decoded response body, or ``u''`` on any failure.
    """
    content = u''
    try:
        # Use a None sentinel instead of a shared mutable ``{}`` default.
        fields = {} if dict_data is None else dict_data
        request = urllib2.Request(url, urllib.urlencode(fields))
        response = urllib2.urlopen(request)
        try:
            if response.getcode() == 200:
                raw_body = response.read()
                content = unicode(raw_body.strip(), 'utf-8', 'ignore')
            else:
                logger.error('fetch error [%s]' % url)
        finally:
            # Always release the connection, even when reading fails.
            response.close()
    except Exception:
        # Deliberately best-effort: log the traceback and return what we
        # have.  Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        logger.error('%s' % traceback.format_exc())
    return content
def post_content(url, data):
    """Send ``data`` as-is to ``url`` via ``urllib2`` and return the raw body.

    Unlike ``post``, ``data`` is passed to ``urllib2.Request`` without any
    encoding, and the response body is returned exactly as read (it is
    neither stripped nor decoded).  A non-200 status code or an exception
    raised while fetching is logged through the module logger and an empty
    unicode string is returned.

    :param url: address to send the request to.
    :param data: already-encoded request payload handed to ``urllib2.Request``.
    :return: the response body as read, or ``u''`` on any failure.
    """
    content = u''
    try:
        request = urllib2.Request(url, data)
        response = urllib2.urlopen(request)
        try:
            if response.getcode() == 200:
                content = response.read()
            else:
                logger.error('fetch error [%s]' % url)
        finally:
            # Always release the connection, even when reading fails.
            response.close()
    except Exception:
        # Deliberately best-effort: log the traceback and return what we
        # have.  Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        logger.error('%s' % traceback.format_exc())
    return content
if __name__ == '__main__':
    # Manual smoke test: fetch one page and print the length of what came back
    # (0 means the fetch failed and was logged).
    page = get("http://www.sina.com")
    print(len(page))