tarfile.ReadError: not a gzip file / download_20newsgroups 数据集失败

本文介绍了一种改进20NewsGroups数据集加载的方法,通过自定义函数_fetch_20newsgroups替代原有fetch_20newsgroups,并实现了数据集的下载、处理与缓存。该方法增强了数据加载的灵活性,同时提供了更丰富的数据处理选项。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

改进方法:

dataset = fetch_20newsgroups(categories=categories)
改为:
dataset = _fetch_20newsgroups(categories=categories)

并添加方法:

import shutil

import matplotlib as mpl
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_header, strip_newsgroup_footer, strip_newsgroup_quoting
from sklearn.utils import check_random_state

mpl.use('Agg')
import os
import pickle
import codecs
import math
import networkx as nx
import pickle as pkl
import numpy as np
from itertools import product
from sklearn.datasets import fetch_20newsgroups, load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import kneighbors_graph
from sklearn.manifold import TSNE
from tqdm import tqdm
import tarfile

from matplotlib import pyplot as plt

# Names of the train/test subdirectories inside the extracted
# 20news-bydate.tar.gz archive (fixed by the upstream dataset layout).
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"
# NOTE(review): batch_size is not used anywhere in this snippet —
# presumably consumed by code outside this excerpt; confirm before removing.
batch_size = 32

# Subset of newsgroup categories to load; passed to _fetch_20newsgroups.
categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']

def download_20newsgroups(target_dir, archive_path="./data/20news-bydate.tar.gz"):
    """Extract a local 20 newsgroups archive and load it into memory.

    The gzip-compressed tar archive at ``archive_path`` is extracted into
    ``target_dir``, both splits are loaded with ``load_files``, and the
    extracted tree is removed again, leaving only the in-memory cache.

    Parameters
    ----------
    target_dir : str
        Scratch directory used for extraction. Created if missing and
        deleted (via ``shutil.rmtree``) before returning.
    archive_path : str, optional
        Path to a pre-downloaded ``20news-bydate.tar.gz``. Defaults to
        ``./data/20news-bydate.tar.gz`` for backward compatibility.

    Returns
    -------
    dict
        ``{'train': ..., 'test': ...}`` — whatever ``load_files`` returns
        for each split, decoded as latin1.

    Raises
    ------
    tarfile.ReadError
        If ``archive_path`` is not a valid gzipped tar file (e.g. a
        truncated download or an HTML error page saved under that name).
    """
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(target_dir, exist_ok=True)

    # Context manager guarantees the archive handle is closed even if
    # extraction fails part-way through (the original leaked it).
    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall(path=target_dir)

    # Load both splits into memory before deleting the extracted files.
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))

    shutil.rmtree(target_dir)
    return cache

def _fetch_20newsgroups(data_home=None, subset='train', categories=None,
                       shuffle=True, random_state=42,
                       remove=(),
                       download_if_missing=True):
    """Load the 20 newsgroups dataset from a locally extracted archive.

    Drop-in replacement for sklearn's ``fetch_20newsgroups`` that relies
    on ``download_20newsgroups`` (which reads a pre-downloaded tarball)
    instead of fetching from the network.

    NOTE(review): ``data_home`` and ``download_if_missing`` are accepted
    only for signature compatibility and are currently ignored; the
    archive is re-extracted on every call (no on-disk cache).

    Parameters
    ----------
    subset : {'train', 'test', 'all'}
        Which split(s) to return.
    categories : list of str or None
        If given, restrict the returned data to these newsgroups.
    shuffle : bool
        Shuffle samples using ``random_state``.
    random_state : int or RandomState
        Seed/state for shuffling.
    remove : tuple
        Any of 'headers', 'footers', 'quotes' to strip from each post.

    Returns
    -------
    Bunch-like object with ``data``, ``target``, ``filenames``,
    ``target_names`` and ``description`` attributes.

    Raises
    ------
    ValueError
        If ``subset`` is not one of 'train', 'test' or 'all'.
    """

    # Extraction target is hard-coded relative to the CWD.
    twenty_home = os.path.join("./", "20news_home")
    cache = download_20newsgroups(target_dir=twenty_home)

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        # NOTE(review): the loop variable deliberately shadows the
        # ``subset`` parameter; harmless here since the parameter is not
        # read again, but worth renaming.
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        # Reuse the last-loaded Bunch (the test split) as the container
        # for the concatenated train+test data.
        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the 20 newsgroups by date dataset'

    # Optionally strip metadata that makes classification artificially easy.
    if 'headers' in remove:
        data.data = [strip_newsgroup_header(text) for text in data.data]
    if 'footers' in remove:
        data.data = [strip_newsgroup_footer(text) for text in data.data]
    if 'quotes' in remove:
        data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        # Shuffle data/target/filenames with the same permutation so the
        # three arrays stay aligned.
        random_state = check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data

(E:\VSCodeVenv\hovernet) E:\VSCodeProjects\hovernet-honernet111\pip_cache>pip install --no-index --find-links=. -r ..\requirements.txt Looking in links: . Processing e:\vscodeprojects\hovernet-honernet111\pip_cache\docopt-0.6.2.tar.gz ERROR: Exception: Traceback (most recent call last): File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1645, in gzopen t = cls.taropen(name, mode, fileobj, **kwargs) File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1621, in taropen return cls(name, mode, fileobj, **kwargs) File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1484, in __init__ self.firstmember = self.next() File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 2299, in next tarinfo = self.tarinfo.fromtarfile(self) File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1092, in fromtarfile buf = tarfile.fileobj.read(BLOCKSIZE) File "E:\VSCodeVenv\hovernet\lib\gzip.py", line 276, in read return self._buffer.read(size) File "E:\VSCodeVenv\hovernet\lib\_compression.py", line 68, in readinto data = self.read(len(byte_view)) File "E:\VSCodeVenv\hovernet\lib\gzip.py", line 463, in read if not self._read_gzip_header(): File "E:\VSCodeVenv\hovernet\lib\gzip.py", line 411, in _read_gzip_header raise OSError('Not a gzipped file (%r)' % magic) OSError: Not a gzipped file (b'<h') During handling of the above exception, another exception occurred: Traceback (most recent call last): File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\cli\base_command.py", line 224, in _main status = self.run(options, args) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\cli\req_command.py", line 180, in wrapper return func(self, options, args) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\commands\install.py", line 321, in run reqs, check_supported_wheels=not options.target_dir File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\resolver.py", line 122, in resolve requirements, max_rounds=try_to_avoid_resolution_too_deep, File 
"E:\VSCodeVenv\hovernet\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 445, in resolve state = resolution.resolve(requirements, max_rounds=max_rounds) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 339, in resolve failure_causes = self._attempt_to_pin_criterion(name, criterion) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 207, in _attempt_to_pin_criterion criteria = self._get_criteria_to_update(candidate) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_vendor\resolvelib\resolvers.py", line 198, in _get_criteria_to_update for r in self._p.get_dependencies(candidate): File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\provider.py", line 102, in get_dependencies for r in candidate.iter_dependencies(with_requires) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\provider.py", line 101, in <listcomp> r File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\candidates.py", line 252, in iter_dependencies requires = self.dist.requires() if with_requires else () File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\candidates.py", line 234, in dist self._prepare() File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\candidates.py", line 221, in _prepare dist = self._prepare_distribution() File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\resolution\resolvelib\candidates.py", line 314, in _prepare_distribution self._ireq, parallel_builds=True, File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\operations\prepare.py", line 480, in prepare_linked_requirement return self._prepare_linked_requirement(req, parallel_builds) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\operations\prepare.py", line 505, in _prepare_linked_requirement self.download_dir, hashes, File 
"E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\operations\prepare.py", line 263, in unpack_url unpack_file(file.path, location, file.content_type) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\utils\unpacking.py", line 270, in unpack_file untar_file(filename, location) File "E:\VSCodeVenv\hovernet\lib\site-packages\pip\_internal\utils\unpacking.py", line 187, in untar_file tar = tarfile.open(filename, mode) File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1591, in open return func(name, filemode, fileobj, **kwargs) File "E:\VSCodeVenv\hovernet\lib\tarfile.py", line 1649, in gzopen raise ReadError("not a gzip file") tarfile.ReadError: not a gzip file 这些包是不是存在问题
最新发布
08-11
[/public/home/pengjy/anaconda3] >>> PREFIX=/public/home/pengjy/anaconda3 WARNING: md5sum mismatch of tar archive expected: 8a581514493c9e0a1cbd425bc1c7dd90 got: 614f6284c34f91affd38a1be2e4be076 - Unpacking payload ... Traceback (most recent call last): File "entry_point.py", line 76, in <module> File "tarfile.py", line 2024, in extractall File "tarfile.py", line 2065, in extract File "tarfile.py", line 2137, in _extract_member File "tarfile.py", line 2186, in makefile File "tarfile.py", line 249, in copyfileobj tarfile.ReadError: unexpected end of data [210095] Failed to execute script entry_point concurrent.futures.process._RemoteTraceback: ''' Traceback (most recent call last): File "concurrent/futures/process.py", line 368, in _queue_management_worker File "multiprocessing/connection.py", line 251, in recv TypeError: __init__() missing 1 required positional argument: 'msg' ''' The above exception was the direct cause of the following exception: Traceback (most recent call last): File "entry_point.py", line 69, in <module> File "concurrent/futures/process.py", line 484, in _chain_from_iterable_of_lists File "concurrent/futures/_base.py", line 611, in result_iterator File "concurrent/futures/_base.py", line 439, in result File "concurrent/futures/_base.py", line 388, in __get_result concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending. [210105] Failed to execute script entry_point 是什么问题,如何解决?
07-25
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值