1. 编写脚本文件,从每个分类中最多拷贝 6500 个文件
# Maximum number of files copied out of each category directory.
MAXCOUNT=6500

#######################################
# Copy a bounded sample of files from every category sub-directory of a
# source tree into a freshly created directory of the same name under a
# destination tree.
# Arguments:
#   $1 - source root; each sub-directory is treated as one category
#   $2 - destination root; <dst>/<category> is wiped and recreated
#   $3 - maximum number of files to copy per category
# Outputs:
#   "item: <category>" per category, and "finished" when the cap is reached
# Returns:
#   0 on success, 1 if the source root is missing or a mkdir/cp fails,
#   2 on missing arguments
#######################################
copy_samples() {
  local src_root=$1 dst_root=$2 max_count=$3
  local category_dir category target file copied

  if [[ -z "$src_root" || -z "$dst_root" || -z "$max_count" ]]; then
    printf 'usage: copy_samples SRC_ROOT DST_ROOT MAX_COUNT\n' >&2
    return 2
  fi
  if [[ ! -d "$src_root" ]]; then
    printf 'source directory not found: %s\n' "$src_root" >&2
    return 1
  fi

  # Glob instead of parsing `ls` so names with spaces or glob characters
  # survive intact; the trailing slash restricts matches to directories.
  for category_dir in "$src_root"/*/; do
    [[ -d "$category_dir" ]] || continue # pattern stays literal on no match
    category_dir=${category_dir%/}
    category=${category_dir##*/}
    echo "item: $category"

    # Always start from an empty target directory. The directory must be
    # created even when it did not exist before, otherwise cp would write
    # every source file onto one regular file named after the category.
    target=$dst_root/$category
    rm -rf -- "${target:?}"
    mkdir -p -- "$target" || return 1

    copied=0
    for file in "$category_dir"/*; do
      [[ -f "$file" ]] || continue # skip sub-directories and empty matches
      cp -- "$file" "$target" || return 1
      copied=$((copied + 1))
      if ((copied >= max_count)); then
        echo finished
        break
      fi
    done
  done
}

copy_samples THUCNews data/thucnews "$MAXCOUNT"
2. 把数据分成测试集、训练集、验证集
import os
def _read_file(filename):
"""读取一个文件并转换为一行"""
with open(filename, 'r', encoding='utf-8') as f:
return f.read().replace('\n', '').replace('\t', '').replace('\u3000', '')
def save_file(dirname):
"""
将多个文件整合并存到3个文件中
dirname: 原数据目录
文件内容格式: 类别\t内容
"""
f_train = open('data/cnews/cnews.train.txt', 'w', encoding='utf-8')
f_test = open('data/cnews/cnews.test.txt', 'w', encoding='utf-8')
f_val = open('data/cnews/cnews.val.txt', 'w', encoding='utf-8')
for category in os.listdir(dirname): # 分类目录
cat_dir = os.path.join(dirname, category)
if not os.path.isdir(cat_dir