import requests
from bs4 import BeautifulSoup
import pymysql
import time
# Connection settings for the local MySQL database the scraper writes to.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from environment variables or a config file kept out of version control.
db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '258852f',
    'database': 'spider',
    'charset': 'utf8mb4'  # utf8mb4 so recipe text (incl. emoji/CJK) stores safely
}
def create_connection():
    """Open and return a fresh pymysql connection built from ``db_config``.

    Callers are responsible for closing the returned connection.
    """
    connection = pymysql.connect(**db_config)
    return connection
from datetime import datetime

# Timestamp captured once, at import time.
# NOTE(review): this module-level value appears unused below --
# insert_recipe_data() receives created_at as a parameter and
# crawl_recipes() computes a fresh timestamp per recipe -- and it would be
# stale if it were used. Confirm it can be removed.
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def insert_recipe_data(name, ingredients, steps, link, created_at):
    """Upsert a single recipe row into the ``recipes`` table.

    On a duplicate key the name/ingredients/steps/link columns are
    refreshed, while ``created_at`` deliberately keeps the value from the
    first insert. A new connection is opened and closed per call.
    """
    upsert_sql = """
            INSERT INTO recipes (name, ingredients, steps, link,created_at)
            VALUES (%s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
            name = VALUES(name),
            ingredients = VALUES(ingredients),
            steps = VALUES(steps),
            link = VALUES(link)
            """
    conn = create_connection()
    try:
        with conn.cursor() as cur:
            # Parameterized query -- values are never interpolated into SQL.
            cur.execute(upsert_sql, (name, ingredients, steps, link, created_at))
        conn.commit()
    finally:
        # Always release the connection, even if execute/commit raised.
        conn.close()
def crawl_recipes():
    """Scrape the xiachufang.com explore page and persist each recipe.

    For every recipe card on the listing page: extract name, link and
    ingredients, fetch the recipe's detail page for its preparation steps,
    then upsert the row via insert_recipe_data(). Sleeps 1 second between
    detail requests to be polite to the server.

    Fixes vs. original:
    - removed an unused DB connection that was opened here and never
      closed (resource leak); insert_recipe_data() manages its own.
    - added request timeouts so a stalled server cannot hang the crawler.
    - guards against recipe cards whose markup lacks the expected tags
      (previously an AttributeError on ``.a`` / ``.get_text()``).
    """
    base_url = 'http://www.xiachufang.com/explore/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
    }
    response = requests.get(base_url, headers=headers, timeout=10)
    if response.status_code != 200:
        print(f'Failed to retrieve data from {base_url}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for recipe in soup.find_all('div', class_='info pure-u'):
        name_tag = recipe.find('p', class_='name')
        ingredients_tag = recipe.find('p', class_='ing ellipsis')
        if name_tag is None or name_tag.a is None or ingredients_tag is None:
            # Card doesn't match the expected layout -- skip it rather than crash.
            continue
        name = name_tag.a.get_text()
        link = 'http://www.xiachufang.com' + name_tag.a['href']
        ingredients = ingredients_tag.get_text()

        print(f'Fetching recipe details from {link}')
        detail_response = requests.get(link, headers=headers, timeout=10)
        if detail_response.status_code == 200:
            detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
            steps = detail_soup.find_all('li', class_='container')
            steps_text = '\n'.join(step.get_text() for step in steps)
            # Fresh timestamp per recipe so created_at reflects crawl time.
            created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            insert_recipe_data(name, ingredients, steps_text, link, created_at)
        else:
            print(f'Failed to retrieve recipe details from {link}')
        time.sleep(1)  # rate-limit between detail-page requests
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    crawl_recipes()