# -*- coding: UTF-8 -*-
import urllib, urllib2, cookielib, re, time, os
import requests
# Announce the tool and collect Zhihu credentials interactively from stdin.
# NOTE(review): the password is echoed to the terminal (raw_input);
# getpass would be the safer choice — confirm before changing UX.
print 'zhihu_QA'
print 'Please input your email:'
MyEmail = raw_input()
print 'Please input your password:'
MyPassWord = raw_input()
print '---------------------------------------------'
#基本信息
Url = 'http://www.zhihu.com/login'
#User_Agent每个电脑都不一样
User_Agent = **********************************************'
#MyReferer = 'http://www.zhihu.com/'
MyValues = {'email' : MyEmail, 'password' : MyPassWord}
MyHeaders = {'User-Agent' : User_Agent, }
MyRequests = requests.session()
MyCont = MyRequests.post(Url, data = MyValues, headers = MyHeaders)
MyCont2 = MyCont.text.encode('UTF-8')
print MyCont2
def GetNowTime():
    """Return the current local time as a 'YYYY-MM-DD_HH-MM-SS' string.

    Used to stamp the output .docx filename so each run writes a new file.
    """
    # strftime formats the current local time when given localtime()
    # with no argument — identical to passing time.localtime(time.time()).
    return time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
# Output path for the generated Word document, stamped with the run time.
# Bug fixed: the original used r'D:\\MyZhiHu' — in a raw string BOTH
# backslashes survive, embedding a literal double backslash in the path.
# A plain string with one escaped backslash yields the intended
# D:\MyZhiHu<timestamp>.docx.
WordPathName = 'D:\\MyZhiHu' + GetNowTime() + '.docx'
# Tag names searched for in the feed: question titles live in <h2>,
# answer bodies in <div>.
String1 = 'h2'
String2 = 'div'
from bs4 import BeautifulSoup
# Parse the login response with an explicit parser.  Fix: the original
# omitted the parser argument, which makes bs4 emit a warning and pick
# whichever parser happens to be installed — results then vary by machine.
MySoup1 = BeautifulSoup(MyCont2, 'html.parser')
def has_need(tag):
    """BeautifulSoup filter: match Zhihu feed-item containers.

    True for tags that have a ``class`` attribute containing the
    ``feed-item-a`` class AND a ``data-type`` attribute.

    Bug fixed: the original tested ``tag.has_attr('feed-item-a')``,
    i.e. looked for an HTML *attribute* named ``feed-item-a``.  That is
    a CSS *class value* on Zhihu's feed divs, so the filter could never
    match; the check now tests membership in the tag's class list.
    """
    return (tag.has_attr('class')
            and 'feed-item-a' in tag['class']
            and tag.has_attr('data-type'))
def has_need_photo(tag):
    """BeautifulSoup filter: match tags carrying both 'class' and 'src'.

    Intended to pick out image tags whose source URL can be downloaded.
    """
    required = ('class', 'src')
    return all(tag.has_attr(name) for name in required)
# Running counter for the questions written into the output document.
QuestionNum = 1
# python-docx: Document builds the .docx; Inches sizes embedded pictures.
from docx import Document
from docx.shared import Inches
MyPhotoPath =
# (CSDN page-scrape artifact, not code) Article title: "Python网络爬虫对知乎首页进行爬取"
# ("Python web crawler scraping the Zhihu homepage"); page metadata: latest recommended article published 2024-04-29 10:03:11.