# 导⼊所需的package
import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式,匹配字符串的模式
import requests #⽤于⽹络连接,发送⽹络请求,使⽤域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图⼯具
# Accumulates one small dict per paper; turned into a DataFrame below.
data = []
# A `with` block closes the file handle automatically, even when reading or
# parsing raises.  The encoding is passed explicitly so that decoding does not
# depend on the platform's locale default.
# NOTE(review): assumes the metadata file is UTF-8 encoded -- confirm.
with open("E:\\DW学习\\Python 学习代码\\arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        # Every line is parsed as a standalone JSON object.
        record = json.loads(line)
        # Keep only the three fields used by the rest of the notebook.
        data.append({
            'title': record['title'],
            'categories': record['categories'],
            'abstract': record['abstract'],
        })
        # Work on a subset only: stop once the line index exceeds 200000.
        if idx > 200000:
            break
# Convert the list of dicts into a DataFrame for analysis with pandas.
data = pd.DataFrame(data)
data
| title | categories | abstract | |
|---|---|---|---|
| 0 | Remnant evolution after a carbon-oxygen white ... | astro-ph | We systematically explore the evolution of t... |
| 1 | Cofibrations in the Category of Frolicher Spac... | math.AT | Cofibrations are defined in the category of ... |
| 2 | Torsional oscillations of longitudinally inhom... | astro-ph | We explore the effect of an inhomogeneous ma... |
| 3 | On the Energy-Momentum Problem in Static Einst... | gr-qc | This paper has been removed by arXiv adminis... |
| 4 | The Formation of Globular Cluster Systems in M... | astro-ph | The most massive elliptical galaxies show a ... |
| ... | ... | ... | ... |
| 170613 | Enhancement of Magneto-Optic Effects via Large... | quant-ph | We utilize the generation of large atomic co... |
| 170614 | Explicit and Exact Solutions to a Kolmogorov-P... | solv-int nlin.SI | Some explicit traveling wave solutions to a ... |
| 170615 | Linear r-Matrix Algebra for a Hierarchy of One... | solv-int nlin.SI | We consider a hierarchy of many-particle sys... |
| 170616 | Pfaff tau-functions | solv-int adap-org hep-th nlin.AO nlin.SI | Consider the evolution $$ \frac{\pl m_\iy}{\... |
| 170617 | The General Solution of the Complex Monge-Amp\... | solv-int nlin.SI | A general solution to the Complex Monge-Amp\... |
170618 rows × 3 columns
# Merge title and abstract into a single text field.  An explicit space is
# inserted so the last word of a title can never fuse with the first word of
# its abstract into one token.
data['text'] = data['title'] + ' ' + data['abstract']
data['text']
0 Remnant evolution after a carbon-oxygen white ...
1 Cofibrations in the Category of Frolicher Spac...
2 Torsional oscillations of longitudinally inhom...
3 On the Energy-Momentum Problem in Static Einst...
4 The Formation of Globular Cluster Systems in M...
...
170613 Enhancement of Magneto-Optic Effects via Large...
170614 Explicit and Exact Solutions to a Kolmogorov-P...
170615 Linear r-Matrix Algebra for a Hierarchy of One...
170616 Pfaff tau-functions Consider the evolution $$...
170617 The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Replace every newline character with a single space (literal match, no regex).
data['text'] = data['text'].str.replace('\n', ' ', regex=False)
data['text']
0 Remnant evolution after a carbon-oxygen white ...
1 Cofibrations in the Category of Frolicher Spac...
2 Torsional oscillations of longitudinally inhom...
3 On the Energy-Momentum Problem in Static Einst...
4 The Formation of Globular Cluster Systems in M...
...
170613 Enhancement of Magneto-Optic Effects via Large...
170614 Explicit and Exact Solutions to a Kolmogorov-P...
170615 Linear r-Matrix Algebra for a Hierarchy of One...
170616 Pfaff tau-functions Consider the evolution $$...
170617 The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Convert all upper-case characters in the text to lower case.
data['text'] = data['text'].str.lower()
data['text']
0 remnant evolution after a carbon-oxygen white ...
1 cofibrations in the category of frolicher spac...
2 torsional oscillations of longitudinally inhom...
3 on the energy-momentum problem in static einst...
4 the formation of globular cluster systems in m...
...
170613 enhancement of magneto-optic effects via large...
170614 explicit and exact solutions to a kolmogorov-p...
170615 linear r-matrix algebra for a hierarchy of one...
170616 pfaff tau-functions consider the evolution $$...
170617 the general solution of the complex monge-amp\...
Name: text, Length: 170618, dtype: object
# Title and abstract now live in 'text'; remove the two source columns.
data = data.drop(columns=['abstract', 'title'])
data
| categories | text | |
|---|---|---|
| 0 | astro-ph | remnant evolution after a carbon-oxygen white ... |
| 1 | math.AT | cofibrations in the category of frolicher spac... |
| 2 | astro-ph | torsional oscillations of longitudinally inhom... |
| 3 | gr-qc | on the energy-momentum problem in static einst... |
| 4 | astro-ph | the formation of globular cluster systems in m... |
| ... | ... | ... |
| 170613 | quant-ph | enhancement of magneto-optic effects via large... |
| 170614 | solv-int nlin.SI | explicit and exact solutions to a kolmogorov-p... |
| 170615 | solv-int nlin.SI | linear r-matrix algebra for a hierarchy of one... |
| 170616 | solv-int adap-org hep-th nlin.AO nlin.SI | pfaff tau-functions consider the evolution $$... |
| 170617 | solv-int nlin.SI | the general solution of the complex monge-amp\... |
170618 rows × 2 columns
# A paper can carry several categories (sub-categories included), stored as
# one space-separated string; split it into a list of category codes.
data['categories'] = data['categories'].str.split(' ')
data['categories']
0 [astro-ph]
1 [math.AT]
2 [astro-ph]
3 [gr-qc]
4 [astro-ph]
...
170613 [quant-ph]
170614 [solv-int, nlin.SI]
170615 [solv-int, nlin.SI]
170616 [solv-int, adap-org, hep-th, nlin.AO, nlin.SI]
170617 [solv-int, nlin.SI]
Name: categories, Length: 170618, dtype: object
# Reduce every category code to its top-level group, i.e. the part before the
# first '.', so 'math.AT' becomes 'math'.  Different sub-categories can share
# a group ('nlin.AO' and 'nlin.SI' both map to 'nlin'), so duplicates are
# removed while keeping first-seen order.
data['categories_big'] = data['categories'].apply(
    lambda cats: list(dict.fromkeys(cat.split('.')[0] for cat in cats))
)
data['categories_big']
0 [astro-ph]
1 [math]
2 [astro-ph]
3 [gr-qc]
4 [astro-ph]
...
170613 [quant-ph]
170614 [solv-int, nlin]
170615 [solv-int, nlin]
170616 [solv-int, adap-org, hep-th, nlin, nlin]
170617 [solv-int, nlin]
Name: categories_big, Length: 170618, dtype: object
from sklearn.preprocessing import MultiLabelBinarizer

# Encode the per-paper lists of top-level categories as a binary indicator
# matrix (one column per distinct category).
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'])
data_label
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 1, 0, 0],
[0, 1, 0, ..., 1, 0, 0],
[0, 0, 0, ..., 1, 0, 0]])
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the merged text, with the vocabulary capped at 4000
# terms via max_features.
# NOTE(review): the vectorizer is fitted on the full corpus, and the
# train/test split only happens afterwards, so its statistics also include
# the held-out documents.  Fit on the training portion only if a strict
# evaluation is required.
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'])
data_tfidf
<170618x4000 sparse matrix of type '<class 'numpy.float64'>'
with 13410005 stored elements in Compressed Sparse Row format>
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf, data_label, test_size=0.2, random_state=1
)

# Multi-label classification: MultiOutputClassifier fits one MultinomialNB
# per label column.
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)

y_pred = clf.predict(x_test)
# Some labels have no true and/or no predicted samples in the test split,
# which leaves precision/recall undefined.  zero_division=0 reports those
# scores as 0.0 without emitting an UndefinedMetricWarning for each case.
# target_names labels the report rows with the category names learned by
# `mlb` instead of bare column indices.
print(classification_report(
    y_test,
    y_pred,
    target_names=list(mlb.classes_),
    zero_division=0,
))
precision recall f1-score support
0 0.00 0.00 0.00 0
1 0.00 0.00 0.00 1
2 0.00 0.00 0.00 0
3 0.91 0.85 0.88 3625
4 0.00 0.00 0.00 4
5 0.00 0.00 0.00 0
6 0.00 0.00 0.00 1
7 0.00 0.00 0.00 0
8 0.77 0.76 0.77 3801
9 0.84 0.89 0.86 10715
10 0.00 0.00 0.00 0
11 0.00 0.00 0.00 186
12 0.44 0.41 0.42 1621
13 0.00 0.00 0.00 1
14 0.75 0.59 0.66 1096
15 0.61 0.80 0.69 1078
16 0.90 0.19 0.32 242
17 0.53 0.67 0.59 1451
18 0.71 0.54 0.62 1400
19 0.88 0.84 0.86 10243
20 0.40 0.09 0.15 934
21 0.00 0.00 0.00 1
22 0.88 0.03 0.07 414
23 0.48 0.65 0.55 517
24 0.37 0.33 0.35 539
25 0.00 0.00 0.00 1
26 0.60 0.42 0.49 3891
27 0.00 0.00 0.00 0
28 0.82 0.08 0.15 676
29 0.86 0.12 0.21 297
30 0.80 0.40 0.53 1714
31 0.00 0.00 0.00 4
32 0.56 0.65 0.60 3398
33 0.00 0.00 0.00 0
micro avg 0.76 0.70 0.72 47851
macro avg 0.39 0.27 0.29 47851
weighted avg 0.75 0.70 0.71 47851
samples avg 0.74 0.76 0.72 47851
D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
D:\Anocanda1\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
这是一个Python初学者的综合练习:读取170618条arXiv论文元数据(标题、类别、摘要三列),将标题与摘要合并为一列文本并做清洗,再用TF-IDF特征和多标签朴素贝叶斯模型对论文所属的大类进行分类。
954

被折叠的 条评论
为什么被折叠?



