步骤
1. 用 pyppeteer 启动浏览器并打开 html 文件
2. 对页面截屏并保存为图片
3. 后处理:裁掉四周的纯白边,只保留有内容的部分
代码
import os
import pyppeteer
from pyppeteer import launch
import asyncio
import cv2
async def get_jpgs(html_path, img_path):
    """Open a local HTML file in a pyppeteer browser and save a screenshot.

    Args:
        html_path: Path of the HTML file. It is appended to ``file://`` to
            form the URL, so it must be an absolute path.
        img_path: Destination path of the screenshot, passed to
            ``page.screenshot`` as the ``path`` option.

    Raises:
        asyncio.TimeoutError: If loading the page takes longer than 30 s.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        # Bound the page load so a hanging page cannot stall the whole batch.
        await asyncio.wait_for(page.goto('file://' + html_path), timeout=30)
        await page.screenshot({'path': img_path})
        await page.close()
    finally:
        # Always shut the browser down, even when loading or the screenshot
        # fails; otherwise each failure would leave a browser process behind.
        await browser.close()
def postprocess(img_path):
    """Crop the image at *img_path* to its non-white content, in place.

    A pixel counts as content when at least one of its channels is below
    255. The bounding box of all content pixels is extended by 2 pixels
    towards the top/left and by 1 pixel towards the bottom/right (the slice
    end ``max + 2`` is exclusive), clamped to the image borders, and the
    cropped image is written back to the same path.

    An image that contains no content at all (entirely white) is left
    untouched.

    Args:
        img_path: Path of the image file to crop; it is overwritten.

    Raises:
        OSError: If the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError("could not read image: {}".format(img_path))

    # True wherever any channel of the pixel is darker than pure white.
    content = img.min(axis=2) < 255
    cols = content.any(axis=0)
    rows = content.any(axis=1)
    if not cols.any():
        # Nothing but white: there is no bounding box to crop to.
        return

    height, width = content.shape
    # argmax on a boolean array yields the index of the first True; running
    # it on the reversed array locates the last True. Unlike a `while j > 0`
    # scan this also considers index 0.
    min_x = int(cols.argmax())
    max_x = width - 1 - int(cols[::-1].argmax())
    min_y = int(rows.argmax())
    max_y = height - 1 - int(rows[::-1].argmax())

    margin = 2
    # Clamp the start indices: a negative slice start would wrap around to
    # the far edge and produce an empty or wrong crop. Ends past the border
    # are clipped by slicing itself.
    top = max(min_y - margin, 0)
    left = max(min_x - margin, 0)
    img_table = img[top:max_y + margin, left:max_x + margin, :]
    cv2.imwrite(img_path, img_table)
if __name__ == "__main__":
    import glob

    # get_jpgs builds a file:// URL from the path, so it must be absolute.
    html_path_list = glob.glob('/home/mi/下载/不合格/不合格/htmls/*.html')
    loop = asyncio.get_event_loop()
    for html_path in sorted(html_path_list):
        # Derive ".../imgs/<name>.jpg" from ".../htmls/<name>.html" by path
        # components; a plain str.replace would also rewrite "htmls" or
        # ".html" occurring elsewhere in the path or inside the file name.
        html_dir, html_name = os.path.split(html_path)
        img_dir = os.path.join(os.path.dirname(html_dir), "imgs")
        # The screenshot cannot be written if the output directory is missing.
        os.makedirs(img_dir, exist_ok=True)
        img_path = os.path.join(img_dir, os.path.splitext(html_name)[0] + ".jpg")
        loop.run_until_complete(get_jpgs(html_path, img_path))
        postprocess(img_path)
效果图
输入的html内容如下: