我用Python写的第一个爬虫

本文介绍了一个使用Python进行网页抓取的实例,包括了请求头设置、MD5加密、GZIP解压等关键技术点。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import time
import hashlib
import gzip
import sys
import re
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
from bs4 import BeautifulSoup
import datetime
import random
import re




def ungzip(data):
    """Decompress *data* with gzip when it is gzip-compressed.

    Args:
        data: Raw response body bytes, possibly gzip-compressed.

    Returns:
        The decompressed bytes, or the original bytes unchanged when
        they are not valid gzip data.
    """
    try:
        print("正在解压.....")
        # gzip.decompress raises OSError (BadGzipFile) for non-gzip input
        # and EOFError for a truncated stream. Catch only those instead of
        # a bare except, which would also swallow KeyboardInterrupt and
        # hide unrelated bugs.
        data = gzip.decompress(data)
        print("解压成功")
    except (OSError, EOFError):
        print("未经压缩,无需解压")
    return data








# Capture the current wall-clock time, both in seconds and as the
# whole-millisecond value used by the 'appverify' timestamp format.
t = time.time()
millis = int(round(1000 * t))
def md5(s):
    """Return the hex MD5 digest of the UTF-8 encoding of *s*."""
    digest = hashlib.md5(s.encode('utf-8'))
    return digest.hexdigest()
# Show the signature of the sample timestamp, then the current one,
# so the 'appverify' value below can be regenerated by hand if needed.
for value in (md5('1484740695133'), millis):
    print(value)


# --- First request: fetch the app update configuration -------------------
LoginUrl = "http://erp.sciyon.com:9090/CheckIn/GetUpdateConfig.aspx"

# Headers mimicking the Android client. 'appverify' carries an MD5
# signature plus the timestamp it was computed from.
headers = {
    'Charset': 'UTF-8',
    'User-Agent': 'Mozilla/5.0 (Linux; U; Mobile; Android 5.1.1;MI 4S Build/FRF91',
    'Referer': 'http://10.88.10.15/jsFrame/jsFrame/login.aspx?login=login',
    'Accept': '*/*',
    'Connection': 'Keep-Alive',
    'Accept-Encoding': 'gzip, deflate',
    'appverify': 'md5=440cb37c164691c80716b34a39e7e335;ts=1484740695133',
    'x-mas-app-id': 'aaald10031',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'erp.sciyon.com:9090',
}

platForm = 'android'

# Form body: a single field identifying the client platform.
postDict = {'platForm': platForm}
postData = urllib.parse.urlencode(postDict).encode()

# Supplying a data payload makes urllib send this as a POST request.
request = urllib.request.Request(LoginUrl, data=postData, headers=headers)




try:
    # Send the request as if from a browser and read the raw body.
    response = urllib.request.urlopen(request)
    # Transparently handle a gzip-compressed response body.
    response = ungzip(response.read())
    # Decode the body to text (UTF-8 by default).
    page = response.decode()
    print(page)


except urllib.error.HTTPError as e:
    # HTTP-level failure: a status code is available.
    print(e.code, ':', e.reason)
except urllib.error.URLError as e:
    # Transport-level failure (DNS, refused connection, ...). A plain
    # URLError has no .code attribute, so printing e.code here would
    # itself raise AttributeError and mask the real error.
    print(e.reason)


#----------------------------------
# --- Second request: log in (credentials are embedded in the query) ------
LoginUrl = "http://网址.com:9090/CheckIn/LoginProxy.aspx?userID=111&password=123"

# Same client-mimicking headers as the first request, minus Content-Type
# since this one is a plain GET.
headers = {
    'appverify': 'md5=440cb37c164691c80716b34a39e7e335;ts=1484740695133',
    'Charset': 'UTF-8',
    'x-mas-app-id': 'aaald10031',
    'User-Agent': 'Mozilla/5.0 (Linux; U; Mobile; Android 5.1.1;MI 4S Build/FRF91',
    'Referer': 'http://10.88.10.15/jsFrame/jsFrame/login.aspx?login=login',
    'Accept': '*/*',
    'Connection': 'Keep-Alive',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'erp.sciyon.com:9090',
}

# Persist session cookies to disk (Mozilla cookies.txt format) so a later
# run can reuse the login; the opener routes every request through the jar.
cookie_filename = 'cookiesss.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie_support = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_support)


try:
    # Hit the login URL through the cookie-aware opener; any Set-Cookie
    # headers in the response land in the jar.
    response = opener.open(LoginUrl)
    # Transparently handle a gzip-compressed response body.
    response = ungzip(response.read())
    # Decode the body to text (UTF-8 by default).
    page = response.decode()
    print(page)
except urllib.error.HTTPError as e:
    # HTTP-level failure: a status code is available.
    print(e.code, ':', e.reason)
except urllib.error.URLError as e:
    # A plain URLError has no .code attribute; printing e.code here
    # would raise AttributeError and mask the real failure.
    print(e.reason)


# Save all cookies, including session-only and expired ones, so the next
# run can restore the login state from disk.
cookie.save(ignore_discard=True, ignore_expires=True)
print(cookie)


#--------------------------
# Disabled check-in request, kept for reference: it would POST GPS
# coordinates and device identifiers (phone number, MAC-like SIM id) to
# the check-in endpoint through the logged-in opener above.
'''
t_url = "http://erp.sciyon.com:9090/CheckIn/CheckAppProxy.ashx"
t_postDict = {
    'Action':'CHECKIN',
    'platForm':'android',
    'PNO':'123',
    'SIM':'64:cc:2e:73:1c:8f',
    'LNG':'118.851191',
    'LAT':'31.897862'
    }
t_postData=urllib.parse.urlencode(t_postDict).encode()
try:
    #模拟浏览器发送请求,并获取返回结果
    response = opener.open(t_url,t_postData)
    #将返回结果解压
    response = ungzip(response.read())
    #将返回结果解码
    page = response.decode()
    print(page)
except urllib.error.URLError as e:
    print(e.code,':',e.reason)
'''


# Release resources held by the cookie-aware opener.
opener.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值