正则表达式与数据库操作-CSDN博客

本文链接：https://blog.csdn.net/m0_60307674/article/details/119321501

根据视频进度，继续学习

1.Re 正则表达式

# coding:utf-8
# @Time:2021/8/2 10:37
# @Author:YuFei
# @File: testRe.py
# @Software:PyCharm

import re


# 搜索 验证 比对
# re 字符串的模式,规定字符串  判断字符串是否符合格式要求
# . 表示任何单个字符
# [] 字符集，对单个字符给出取值范围  [^] 非字符集，非取值范围
# * 前个字符 0次或无限次扩展 +前1个字符1次或无限次扩展
# ? 前一个字符0次或1次扩展  | 左右表达式任何一个
# {m} 前一个字符m次 {m,n} 扩展前一个字符m-n次
# ^ 匹配字符串开头 $ 匹配字符串结尾
# () 分组标记, 内部只能使用 | 操作符
# \d 数字 = [0-9]  \w 单词字符 =[A-Za-z0-9_]

# 配置模式 re.I 大小写不敏感 re.L 使本地化识别匹配
# re.M 多行匹配,影响^ $ re.S 使.匹配包括换行在内的所有字符
# re.U 根据Unicode字符集解析字符, 影响 \w \b \W \B
# re.X 更灵活的格式将正则表达式写的更加易于理解

# --- search: report only the first match ---
# Build a reusable pattern object; "AA" is the regular expression that other
# strings are checked against.
pet = re.compile("AA")
# Match spans are half-open: pet.search("ABCAA") would report span=(3, 5).
m = pet.search(string="AABCAAAHGFSFZZAA")
print(m)

# The same kind of lookup without a pattern object: the rule comes first,
# the string being checked comes second.
m = re.search(pattern="asd", string="Aasd")
print(m)

# --- findall: collect every match ---
# "[A-Z]+" returns each run of consecutive upper-case letters as one item
# (plain "[A-Z]" would return the letters one by one).
m = re.findall(pattern="[A-Z]+", string="ASDaDFGAa")
print(m)

# --- sub: replace every occurrence of the pattern ---
print(re.sub(pattern="a", repl="A", string="abcdefg"))

# Tip: prefix regex-related string literals with r (raw string) so that
# backslashes do not have to be escaped.
a = r"\aabd-\'"
print(a)

2.Xlwt excel写入练习

# coding:utf-8
# @Time:2021/8/3 12:24
# @Author:YuFei
# @File: testXwlt.py
# @Software:PyCharm

import xlwt

# Write a 9 x 9 multiplication table into "student.xls" and echo it to stdout.
workbook = xlwt.Workbook(encoding="utf-8")  # create the workbook object
worksheet = workbook.add_sheet("sheet1")  # create the worksheet
for n in range(1, 10):
    for i in range(1, n + 1):
        text = '%d * %d = %d' % (n, i, n * i)
        print(text, end=' ')
        # write(row, column, content): the product n * i goes to cell (n-1, i-1)
        worksheet.write(n - 1, i - 1, text)
    print()  # end the console line after each row of the table
workbook.save("student.xls")

3.数据库操作 Sqlite3

# coding:utf-8
# @Time:2021/8/3 12:59
# @Author:YuFei
# @File: testSqlite.py
# @Software:PyCharm

import sqlite3

# 1.连接数据库
# conn = sqlite3.connect("test.db")  # 后缀名无所谓,db表示是数据库 默认路径  打开或创建数据库文件
# print("Openned database sucessfully")

# 建表
# conn = sqlite3.connect("test.db")
# print("成功打开数据库")
# c = conn.cursor()  # 获取游标
# sql = """
#     create table company
#         (id int primary key not null,
#         name text not null,
#         age int not null,
#         address char(52),
#         salary real);
# """
# c.execute(sql)  # 执行Sql
# conn.commit()  # 提交数据库操作
# conn.close()  # 关闭连接
# print("成功建表")
# 插入数据
# conn = sqlite3.connect("test.db")
# print("成功打开数据库")
# c = conn.cursor()  # 获取游标
# sql1 = """
#     insert into company(id, name, age, address, salary)
#     values(1, '张三', 32, '成都', 8000);
# """
# sql2 = """
#     insert into company(id, name, age, address, salary)
#     values(2, '阿达', 32, '大', 80000);
# """
# c.execute(sql1)  # 执行Sql
# c.execute(sql2)
# conn.commit()  # 提交数据库操作
# conn.close()  # 关闭连接
# print("插入完毕")
# 查询数据
# Query the rows of the "company" table and print selected columns.
conn = sqlite3.connect("test.db")
print("成功打开数据库")
try:
    c = conn.cursor()  # obtain a cursor
    sql = "select id, name, address, salary from company"
    cursor = c.execute(sql)
    for row in cursor:
        print("id= ", row[0])
        print("name= ", row[1])
        print("address= ", row[2])
        print("salary= ", row[3], '\n')
finally:
    # Close the connection even when the query fails (e.g. missing table).
    conn.close()
print("查询完毕")

4.爬取保存至数据库

# coding:utf-8
# @Time:2021/7/28 14:23
# @Author:YuFei
# @File: 16.spider.py
# @Software:PyCharm
import time

from bs4 import BeautifulSoup  # 网页解析 获取数据
import re  # 正则表达式 文字匹配
import urllib.request, urllib.error  # 制定url-获取网页数据
import xlwt  # 进行excel操作 存进excel
import sqlite3  # 进行SQLite数据库操作
import urllib  # 获取数据


def main():
    """Scrape the Douban Top 250 movie list and store it in ``movie.db``.

    The records returned by ``get_data`` are written to SQLite through
    ``save_data2``. To export to Excel instead, call
    ``save_data(datalist, "豆瓣电影Top250.xls")``.
    """
    # get_data() appends the numeric offset (0, 25, 50, ...) itself, so the
    # base URL must stop right after "start="; a trailing "0" here would
    # produce "start=00", "start=025", ...
    base_url = "https://movie.douban.com/top250?start="
    datalist = get_data(base_url)
    dbpath = "movie.db"
    save_data2(datalist, dbpath)


# Pre-compiled patterns applied to the HTML text of one movie entry.
# Each pattern has a single capture group holding the value of interest.

# Poster image URL; re.S lets "." also match newlines.
findImgSrc = re.compile(r'<img.*src="(.*?)"', flags=re.S)
# Movie title(s) inside <span class="title">.
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# Target of an <a href="..."> link.
findLink = re.compile(r'<a href="(.*?)">')
# Rating value.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of people who rated (digits before "人评价").
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary inside <span class="inq">.
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Related-information paragraph; may span several lines, hence re.S.
findBd = re.compile(r'<p class="">(.*?)</p>', flags=re.S)


# Parse one movie entry
def _parse_movie(item_html):
    """Extract the stored fields from the HTML text of one movie entry.

    Returns a list of eight items, in this order: detail link, image URL,
    Chinese title, foreign title, rating, number of ratings, one-line
    summary, related information.

    Raises IndexError if the link, image, rating, rating-count or
    related-information pattern finds nothing in *item_html*.
    """
    link = findLink.findall(item_html)[0]
    img_src = findImgSrc.findall(item_html)[0]

    # An entry may carry only a Chinese title; the foreign-title slot is
    # then left empty so the columns stay aligned.
    titles = findTitle.findall(item_html)
    ctitle = titles[0] if titles else ''
    otitle = titles[1].replace('/', '') if len(titles) == 2 else ''

    rating = findRating.findall(item_html)[0]
    judge_num = findJudge.findall(item_html)[0]

    # The one-line summary is optional.
    inq = findInq.findall(item_html)
    summary = inq[0].replace('。', '') if inq else ''

    bd = findBd.findall(item_html)[0]
    bd = re.sub(r'<br(\s+)?/>(\s+)?', '', bd)  # drop <br/> and the whitespace after it
    bd = bd.replace('\xa0', '')  # drop non-breaking spaces
    bd = bd.replace('/', '')
    bd = bd.strip()  # remove leading/trailing whitespace

    return [link, img_src, ctitle, otitle, rating, judge_num, summary, bd]


# Crawl the pages
def get_data(base_url):
    """Download the list pages and return one record per movie.

    Ten pages are requested; the page offset ``i * 25`` (0, 25, ..., 225) is
    appended to *base_url* as text. Each ``<div class="item">`` found in a
    page is converted to a list of eight fields by ``_parse_movie``.

    Returns a list of those records, in page order.
    """
    datalist = []
    for page in range(10):
        url = base_url + str(page * 25)
        html = ask_url(url)  # source of one page
        soup = BeautifulSoup(html, "html.parser")  # build the document tree
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_movie(str(item)))
    return datalist


# Fetch the content of one URL
def ask_url(url):
    """Return the body of *url* decoded as UTF-8.

    If the request raises ``urllib.error.URLError``, the error's ``code``
    and/or ``reason`` (whichever it has) are printed and an empty string is
    returned.
    """
    # User agent: tells the server what kind of machine and browser is asking.
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The with-block closes the response once it has been read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Save the data to an Excel workbook
def save_data(datalist, save_path):
    """Write the movie records to an .xls workbook at *save_path*.

    Row 0 holds the eight column headers; record ``i`` of *datalist* is
    written to row ``i + 1``, using the first eight items of the record.
    Every record in *datalist* is written, however many there are.
    """
    book = xlwt.Workbook(encoding="utf-8")  # create the workbook object
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)  # create the worksheet
    col = ("电影详情链接", "图片", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over what was actually scraped instead of assuming 250 records,
    # so a short result list does not raise IndexError.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)


def save_data2(datalist, dbpath):
    """Insert the movie records into the ``movie250`` table of *dbpath*.

    ``init_db`` is called first to create the table. Each record must hold
    eight items in column order (info_link, pic_link, cname, ename, score,
    rated, intruductions, info). The score and rating count (positions 4
    and 5) are passed through unchanged; every other item is converted with
    ``str``. *datalist* itself is not modified.

    All rows are inserted with bound parameters and committed together.
    """
    init_db(dbpath)
    # "?" placeholders let sqlite3 do the quoting, so text containing quote
    # characters can neither break nor alter the statement.
    sql = '''
        insert into movie250 (info_link, pic_link, cname, ename, score, rated, intruductions, info)
        values (?, ?, ?, ?, ?, ?, ?, ?)'''
    rows = [
        [value if index in (4, 5) else str(value) for index, value in enumerate(data)]
        for data in datalist
    ]
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        cur.executemany(sql, rows)
        conn.commit()
        cur.close()
    finally:
        # Release the connection even if an insert fails.
        conn.close()


def init_db(dbpath):
    """Create the ``movie250`` table in the SQLite file *dbpath*.

    The table is only created when it does not exist yet, so calling this
    function again on the same database is harmless.
    """
    sql = """
        create table if not exists movie250
        (id integer primary key  autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        ename varchar,
        score numeric,
        rated numeric,
        intruductions text,
        info text)
    """  # table definition
    conn = sqlite3.connect(dbpath)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    finally:
        # Release the connection even if the statement fails.
        conn.close()


# Program entry point: when this file is run as a script, execution starts here
if __name__ == "__main__":
    # init_db("movietest.db")
    main()