import cv2
import pytesseract
import numpy as np
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# Tell pytesseract which Tesseract executable to invoke (hard-coded absolute path).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
def extract_features(image_path):
    """Extract OCR and histogram features from an image file.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.

    Returns:
        A tuple ``(text_len, color_hist, total_pixels)``:

        * ``text_len`` -- length of the string Tesseract returns for the
          grayscale image (every character counts, whitespace included).
        * ``color_hist`` -- flattened 256-bin histogram of the grayscale
          intensities.
        * ``total_pixels`` -- product of the first two entries of the loaded
          image's shape (rows times columns).

    Raises:
        ValueError: If ``cv2.imread`` cannot read or decode the file.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than with an opaque OpenCV assertion
    # inside cvtColor.
    if img is None:
        raise ValueError(f"cv2.imread could not read or decode {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text_len = len(pytesseract.image_to_string(gray))

    color_hist = cv2.calcHist([gray], [0], None, [256], [0, 256]).flatten()

    total_pixels = np.prod(img.shape[:2])
    return text_len, color_hist, total_pixels
def is_webpage_screenshot(image_path, min_text_length=100, max_text_density=0.05):
    """Decide whether an image passes the text-based screenshot heuristic.

    Args:
        image_path: Path of the image, forwarded to ``extract_features``.
        min_text_length: Minimum number of OCR characters required.
        max_text_density: Upper bound on OCR characters per pixel.

    Returns:
        ``True`` when the OCR text has at least ``min_text_length`` characters
        and its characters-per-pixel ratio does not exceed
        ``max_text_density``; ``False`` otherwise.
    """
    char_count, _, pixel_count = extract_features(image_path)

    # Too little text: reject before computing the density at all.
    if char_count < min_text_length:
        return False

    # Accept unless the characters-per-pixel ratio is above the ceiling.
    return not (char_count / pixel_count > max_text_density)
# Directory scanned for images, and the directory that receives every image
# for which is_webpage_screenshot returns False.
source_dir = "data/aigc_0"
dest_dir = "data/aigc_0_non"

# exist_ok=True replaces the separate os.path.exists check, which was a
# check-then-create race. It also fails fast if dest_dir exists but is not a
# directory.
os.makedirs(dest_dir, exist_ok=True)

image_paths = [os.path.join(source_dir, filename) for filename in os.listdir(source_dir)]

num_threads = 10
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Map each future back to its path so results can be attributed as they
    # complete (completion order is not submission order).
    futures = {executor.submit(is_webpage_screenshot, path): path for path in image_paths}
    for future in tqdm(as_completed(futures), total=len(image_paths), desc="Processing images"):
        path = futures[future]
        try:
            result = future.result()
        except Exception as e:
            # Best-effort batch: report the failing image and leave it in place.
            print(f"An error occurred while processing {path}: {e}")
            continue

        if result:
            continue

        # A failed move (e.g. a same-named file already in dest_dir) must not
        # abort the rest of the batch, so it is reported and skipped.
        try:
            shutil.move(path, dest_dir)
        except OSError as e:
            print(f"Could not move {path} to {dest_dir}: {e}")