Python零基础投喂(综合练习:4.论文种类分类).py

这是一个Python初学者的综合练习,内容涉及对170618条数据进行处理,每条数据包含3列或2列信息,可能是为了进行论文类型的分类任务。
# 导⼊所需的package
import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式,匹配字符串的模式
import requests #⽤于⽹络连接,发送⽹络请求,使⽤域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图⼯具

data = []  # one dict per paper; turned into a DataFrame after the loop
# `with` closes the file handle automatically, even if reading or parsing raises.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise use the platform default (e.g. a legacy code page on Windows) and
# could fail or mis-decode non-ASCII characters.
with open("E:\\DW学习\\Python 学习代码\\arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        # The file is read as JSON Lines: one JSON object per line.
        d = json.loads(line)
        # Keep only the three fields used by the rest of the script.
        d = {'title': d['title'], 'categories': d['categories'], 'abstract': d['abstract']}
        data.append(d)

        # Only use part of the data: stop once the line index exceeds 200000.
        if idx > 200000:
            break
data = pd.DataFrame(data)  # list of dicts -> DataFrame, convenient for pandas analysis
data
titlecategoriesabstract
0Remnant evolution after a carbon-oxygen white ...astro-phWe systematically explore the evolution of t...
1Cofibrations in the Category of Frolicher Spac...math.ATCofibrations are defined in the category of ...
2Torsional oscillations of longitudinally inhom...astro-phWe explore the effect of an inhomogeneous ma...
3On the Energy-Momentum Problem in Static Einst...gr-qcThis paper has been removed by arXiv adminis...
4The Formation of Globular Cluster Systems in M...astro-phThe most massive elliptical galaxies show a ...
............
170613Enhancement of Magneto-Optic Effects via Large...quant-phWe utilize the generation of large atomic co...
170614Explicit and Exact Solutions to a Kolmogorov-P...solv-int nlin.SISome explicit traveling wave solutions to a ...
170615Linear r-Matrix Algebra for a Hierarchy of One...solv-int nlin.SIWe consider a hierarchy of many-particle sys...
170616Pfaff tau-functionssolv-int adap-org hep-th nlin.AO nlin.SIConsider the evolution $$ \frac{\pl m_\iy}{\...
170617The General Solution of the Complex Monge-Amp\...solv-int nlin.SIA general solution to the Complex Monge-Amp\...

170618 rows × 3 columns

# Build one text field per paper: the title immediately followed by the abstract.
data['text'] = data['title'].add(data['abstract'])
data['text']
0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom...
3         On the Energy-Momentum Problem in Static Einst...
4         The Formation of Globular Cluster Systems in M...
                                ...                        
170613    Enhancement of Magneto-Optic Effects via Large...
170614    Explicit and Exact Solutions to a Kolmogorov-P...
170615    Linear r-Matrix Algebra for a Hierarchy of One...
170616    Pfaff tau-functions  Consider the evolution $$...
170617    The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Replace every newline character in the text with a single space.
data['text'] = [text.replace('\n', ' ') for text in data['text']]
data['text']
0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom...
3         On the Energy-Momentum Problem in Static Einst...
4         The Formation of Globular Cluster Systems in M...
                                ...                        
170613    Enhancement of Magneto-Optic Effects via Large...
170614    Explicit and Exact Solutions to a Kolmogorov-P...
170615    Linear r-Matrix Algebra for a Hierarchy of One...
170616    Pfaff tau-functions  Consider the evolution $$...
170617    The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Convert every upper-case character in the text to lower case.
data['text'] = data['text'].map(str.lower)
data['text']
0         remnant evolution after a carbon-oxygen white ...
1         cofibrations in the category of frolicher spac...
2         torsional oscillations of longitudinally inhom...
3         on the energy-momentum problem in static einst...
4         the formation of globular cluster systems in m...
                                ...                        
170613    enhancement of magneto-optic effects via large...
170614    explicit and exact solutions to a kolmogorov-p...
170615    linear r-matrix algebra for a hierarchy of one...
170616    pfaff tau-functions  consider the evolution $$...
170617    the general solution of the complex monge-amp\...
Name: text, Length: 170618, dtype: object
# 'text' now holds title + abstract, so the two source columns are dropped.
data = data.drop(columns=['abstract', 'title'])
data
categoriestext
0astro-phremnant evolution after a carbon-oxygen white ...
1math.ATcofibrations in the category of frolicher spac...
2astro-phtorsional oscillations of longitudinally inhom...
3gr-qcon the energy-momentum problem in static einst...
4astro-phthe formation of globular cluster systems in m...
.........
170613quant-phenhancement of magneto-optic effects via large...
170614solv-int nlin.SIexplicit and exact solutions to a kolmogorov-p...
170615solv-int nlin.SIlinear r-matrix algebra for a hierarchy of one...
170616solv-int adap-org hep-th nlin.AO nlin.SIpfaff tau-functions consider the evolution $$...
170617solv-int nlin.SIthe general solution of the complex monge-amp\...

170618 rows × 2 columns

# A paper can carry several categories (sub-categories included), separated
# by single spaces; turn the string into a list of category codes.
data['categories'] = data['categories'].map(lambda cats: cats.split(' '))
data['categories']
0                                             [astro-ph]
1                                              [math.AT]
2                                             [astro-ph]
3                                                [gr-qc]
4                                             [astro-ph]
                               ...                      
170613                                        [quant-ph]
170614                               [solv-int, nlin.SI]
170615                               [solv-int, nlin.SI]
170616    [solv-int, adap-org, hep-th, nlin.AO, nlin.SI]
170617                               [solv-int, nlin.SI]
Name: categories, Length: 170618, dtype: object
# Reduce each category code to its top-level part: the text before the first '.'.
data['categories_big'] = data['categories'].map(
    lambda cats: [cat.partition('.')[0] for cat in cats]
)
data['categories_big']
0                                       [astro-ph]
1                                           [math]
2                                       [astro-ph]
3                                          [gr-qc]
4                                       [astro-ph]
                            ...                   
170613                                  [quant-ph]
170614                            [solv-int, nlin]
170615                            [solv-int, nlin]
170616    [solv-int, adap-org, hep-th, nlin, nlin]
170617                            [solv-int, nlin]
Name: categories_big, Length: 170618, dtype: object
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each paper's list of top-level categories as one row of a binary
# indicator matrix (one column per distinct category).
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'])
data_label

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the text column, capped at 4000 terms via max_features.
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'])
data_tfidf
<170618x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 13410005 stored elements in Compressed Sparse Row format>
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf,
    data_label,
    test_size=0.2,
    random_state=1,
)

from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

# MultiOutputClassifier wraps the base estimator so that the multi-label
# indicator matrix y_train can be fitted with MultinomialNB.
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)

# Predict once and reuse the result for the report.
y_pred = clf.predict(x_test)

# target_names: label the report rows with the category names learned by the
# binarizer instead of bare column indices.
# zero_division=0: labels with no true or no predicted samples are reported
# as 0.0 explicitly, instead of emitting UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    target_names=list(mlb.classes_),
    zero_division=0,
))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
           3       0.91      0.85      0.88      3625
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.77      0.76      0.77      3801
           9       0.84      0.89      0.86     10715
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00       186
          12       0.44      0.41      0.42      1621
          13       0.00      0.00      0.00         1
          14       0.75      0.59      0.66      1096
          15       0.61      0.80      0.69      1078
          16       0.90      0.19      0.32       242
          17       0.53      0.67      0.59      1451
          18       0.71      0.54      0.62      1400
          19       0.88      0.84      0.86     10243
          20       0.40      0.09      0.15       934
          21       0.00      0.00      0.00         1
          22       0.88      0.03      0.07       414
          23       0.48      0.65      0.55       517
          24       0.37      0.33      0.35       539
          25       0.00      0.00      0.00         1
          26       0.60      0.42      0.49      3891
          27       0.00      0.00      0.00         0
          28       0.82      0.08      0.15       676
          29       0.86      0.12      0.21       297
          30       0.80      0.40      0.53      1714
          31       0.00      0.00      0.00         4
          32       0.56      0.65      0.60      3398
          33       0.00      0.00      0.00         0

   micro avg       0.76      0.70      0.72     47851
   macro avg       0.39      0.27      0.29     47851
weighted avg       0.75      0.70      0.71     47851
 samples avg       0.74      0.76      0.72     47851



D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值