这篇日志的题目好想叫做<论编程不好是怎样一种体验>…
文件
1.input file (10万行)
"ClipID","yfcc100m_hash"
"YFCC100M53595ec67db890b7ad32dfa375196dec","53595ec67db890b7ad32dfa375196dec"
"YFCC100Md6d7f154d7ad0a367353b8a4c7f868","d6d7f154d7ad0a367353b8a4c7f868"
"YFCC100Mabfa822c14bcac4c0f45deb97d18b5f","abfa822c14bcac4c0f45deb97d18b5f"
"YFCC100Md665346779ab26f33f3e537462daf3a","d665346779ab26f33f3e537462daf3a"
"YFCC100M2c7480e2c372a568f5efe91e323ea022","2c7480e2c372a568f5efe91e323ea022"
"YFCC100Mf2b1ab759b93a444efcff0f1d8db86ad","f2b1ab759b93a444efcff0f1d8db86ad"
"YFCC100M3430141d3e43fd1dc903147ecbbe3c","3430141d3e43fd1dc903147ecbbe3c"
"YFCC100M3fd1fb7996b492bc324f50fe6510cd31","3fd1fb7996b492bc324f50fe6510cd31"
"YFCC100M5bab830a1651bbcd45476a2a589d8","5bab830a1651bbcd45476a2a589d8"
"YFCC100Me967f4132a9490dea9890d551d4313e","e967f4132a9490dea9890d551d4313e"
"YFCC100M7617ffd88e62dcec4563f958eb879","7617ffd88e62dcec4563f958eb879"
"YFCC100Mc397a76d6b461ab0319bbadb2ee7997","c397a76d6b461ab0319bbadb2ee7997"
"YFCC100Mb44854aeaa7ab878b606699bf46a94a","b44854aeaa7ab878b606699bf46a94a"
"YFCC100M4db1d3a7dafad886ebaf4121aff4a921","4db1d3a7dafad886ebaf4121aff4a921"
"YFCC100M716414c3a7cc5dca529a9dd32a8d9bed","716414c3a7cc5dca529a9dd32a8d9bed"
"YFCC100M721b1d73bfad7edb75e364bce5a7ba88","721b1d73bfad7edb75e364bce5a7ba88"
"YFCC100Md6452731b2e6fd49bdc81791b522","d6452731b2e6fd49bdc81791b522"
"YFCC100Mdccadda6f254e2c0612b563cc0fc2ff8","dccadda6f254e2c0612b563cc0fc2ff8"
"YFCC100Mf0db32256c23de67e3f42824b622d1d5","f0db32256c23de67e3f42824b622d1d5"
2.hash file (4G)
//identifier, hash value
37941 f81e7c3af3799b60b7dbd514c67184af
37946 dfdb5ed692372b7fde1be75a17b45fcc
38121 df85d12e831af19b1e5ccff79169df45
38129 d919459df4a5d7b0a2ae56bbfcffa9dd
38143 32d6f03a3dc5ad04af8919e333e95d7
38225 f4281b87d46177294726c7d933366a2
38389 7ff9a834d0d25f47d50a3d9bec68dd
38395 017fdbe9f414c3933cb92dc64fc6d56
38397 716b13e1bfa95c40569b817a5ebccf0
38408 e05beeaa8528a98742ceae5ac4f2718
38414 c239c75be9558cedd65a6b1a5e6deee8
38425 516754853cb6f40682f5d28d3458f
69471 ecea8b8d25ba22c3326136666927e0
69500 f02cb5ad1113481e9368bd3aa09b971a
69604 7ebf6e2e9ccda758a525a1af70123f
69606 50c1c3d43ea4d1dd70579681b4adf15
69725 65ec648d1525d9158c5f6b5fce2ee8
69802 e377a4faf03e6a26cd845fff5cce44
69895 53e6707335874eac29b69b5782f94061
69941 ae0915a8b9233286ebf29e5a981c7d
70023 3ef140ec0f976448fa85e779fdcdde
70110 7c1db88bea789d5e238580f9c776f99e
70112 e8e67e98fdfce778e31cd9c64c12b33
70116 c72674eddec0635d639fb364227d5e32
70117 7532f1654150877f4ea404692a35586
70118 45f7fdd9ea4bdd9631ba75f2517b3e9
70167 c665ad8761df1393ce9d35fdba67c6
70171 7bbbd2607517b199818fd135a8727b1
70174 45c6ba356fa5b5e24de4d4ce8c7a4cba
70190 ab29d9418269647f3ceb20abea6114d
70191 8bee6dc05fbdceb5f3593aa4fc83e14d
70215 9936d88599246d74e7a7f12d53654f
70226 5633de7aa75a65ebec7d4557f522e36e
70340 2b66aa778087ff52465361490cdcff6
70851 b4f09d0ac4cddbf3466fdd1b125324
70861 36ff74ff35b8340831b1472d445e44f
70903 6dc2bfe9107159c26edd8697207dcaa4
70978 6a7c8fdfbcfdf1e64552841e41ba19ec
71223 2a8d60a33daecd1fe4fa27b83d2626
71225 cc3c574dd9b248f072361299b78f1397
71355 11ac4e657471b873c2f0ea6aa8f8b9f
71393 c5215949c49cc4a579e6ca7cc55acbfa
71463 65b3b94c6a39b39b786972d2d83e6cf
71468 be801753855b6dd99d648513afb8fed
71530 7c6ef37935c9b641106732018c8ed7b
71957 d542d37b6fdeb1e323a94c191729bb51
72033 e4e8ed141c5aa9dae56ec86cdda7c599
3.line file(1.9G)
//line number, identifier
0 32424
1 10201275523
2 2297552664
3 7289030198
4 4140939180
5 9506922316
6 4436463882
7 4572998878
8 9329902958
9 36306
10 3973434963
11 5155593741
12 2932067831
13 2321187909
14 3710430638
15 4923770586
16 8645906636
17 2409783581
18 4445358691
19 7621544498
20 1076009973
4.data file (45G)
//通过line_number找到data file里对应的那一行. data file中每行有20个词, 以\t分割. 第一个是identifier. 第15个是video/image的下载地址.
程序
程序1
//5min处理一行数据,处理完10w行要到世界末日了吧…
#!/bin/bash
# For every hash in the eval CSV: look up its identifier in the hash file,
# then the identifier's (0-based) line number in the line file, then the
# download URL (tab-separated field 15) of that line in the data file.
# Appends "lineNumber,identifier,hash,url" to $res and "lineNumber,url" to $videolist.
#evalfile=/work/na/Trecvid/2016/MED16-InputFiles_20160701/MED16-EvalFull-YFCC100M_Subset.csv
evalfile=/work/na/from_rabbit/test/YFCC/MED16-EvalFull-YFCC100M_Subset.csv
hashfile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_hash
linefile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_lines
datafile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_dataset
res=/work/na/from_rabbit/test/YFCC/allInformation
videolist=/work/na/from_rabbit/test/YFCC/videolist
log=/work/na/from_rabbit/test/YFCC/log
i=0
# Read hashes line by line; word-splitting a command substitution in a
# for-loop breaks on unexpected whitespace and globs.
sed 's/"//g' "$evalfile" | tail -n +2 | cut -d',' -f2 |
while IFS= read -r hash; do
  i=$((i+1))
  { echo "$i"; date +%Y.%m.%d-%k:%M:%S-%N; } >> "$log"
  # Exact match on the hash column: a plain `grep $hash` also hits lines whose
  # hash merely *contains* this hash as a substring.
  identifier=$(awk -F'\t' -v h="$hash" '$2==h{print $1; exit}' "$hashfile") ##30s
  if [ -z "$identifier" ]; then
    echo "no identifier for hash $hash" >> "$log"
    continue
  fi
  lineNumber=$(awk -F'\t' -v id="$identifier" '$2==id{print $1; exit}' "$linefile") ##15s
  if [ -z "$lineNumber" ]; then
    echo "no line number for identifier $identifier" >> "$log"
    continue
  fi
  lineNumber=$((lineNumber+1))  # line file is 0-based, sed addresses are 1-based
  # data=`sed "${lineNumber}q;d" $datafile`
  # data=`tail -n+${lineNumber} $datafile | head -n1`#
  download_url=$(sed "${lineNumber}q;d" "$datafile" | cut -d$'\t' -f15) ##4min
  date +%Y.%m.%d-%k:%M:%S-%N >> "$log"
  echo "$lineNumber,$identifier,$hash,$download_url" >> "$res"
  echo "$lineNumber,$download_url" >> "$videolist"
done
程序2
#!/bin/bash
# Same pipeline as 程序1, split into three independently switchable stages so
# each intermediate result is materialized in its own file:
#   run1: hash -> identifier (all_identifier: "i,identifier")
#   run2: identifier -> 1-based line number (all_lineNumber: "i,lineNumber")
#   run3: line number -> download URL (all_downloadurl: "i,url")
evalfile=/work/na/Trecvid/2016/MED16-InputFiles_20160701/MED16-EvalFull-YFCC100M_Subset.csv
#evalfile=/work/na/from_rabbit/test/YFCC/MED16-EvalFull-YFCC100M_Subset.csv
hashfile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_hash
linefile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_lines
datafile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_dataset
#2秒钟一行
i=0
run1="no"
if [ "$run1" = "yes" ]; then
  sed 's/"//g' "$evalfile" | tail -n +2 | cut -d',' -f2 |
  while IFS= read -r hash; do
    i=$((i+1))
    echo "$i" >> log_identifier
    date +%Y.%m.%d-%k:%M:%S-%N >> log_identifier
    # Exact match on the hash column; `grep $hash` also matches longer hashes
    # that contain this one as a substring.
    identifier=$(awk -F'\t' -v h="$hash" '$2==h{print $1; exit}' "$hashfile")
    echo "$i,$identifier" >> all_identifier
  done
fi
#2秒一行
i=0
run2="no"
if [ "$run2" = "yes" ]; then
  cut -d',' -f2 all_identifier |
  while IFS= read -r identifier; do
    i=$((i+1))
    echo "$i" >> log_lineNumber
    date +%Y.%m.%d-%k:%M:%S-%N >> log_lineNumber
    lineNumber=$(awk -F'\t' -v id="$identifier" '$2==id{print $1; exit}' "$linefile")
    lineNumber=$((lineNumber+1))  # 0-based line file -> 1-based sed address
    echo "$i,$lineNumber" >> all_lineNumber
  done
fi
#一行4分钟,瓶颈在这里
i=0
run3="yes"
if [ "$run3" = "yes" ]; then
  # Bug fix: all_lineNumber lines are "i,lineNumber" — the original iterated
  # over the whole "i,lineNumber" token, so sed got a malformed address.
  cut -d',' -f2 all_lineNumber |
  while IFS= read -r lineNumber; do
    i=$((i+1))
    echo "$i" #>> log_downloadurl
    date +%Y.%m.%d-%k:%M:%S-%N #>> log_downloadurl
    download_url=$(sed "${lineNumber}q;d" "$datafile" | cut -d$'\t' -f15)
    echo "$i,$download_url" >> all_downloadurl
  done
fi
程序三
#!/bin/bash
# Batch variant: resolve all hashes -> identifiers -> line numbers with two
# bulk fixed-string grep passes, then pull field 15 for each line number.
evalfile=/work/na/Trecvid/2016/MED16-InputFiles_20160701/MED16-EvalFull-YFCC100M_Subset.csv
hashfile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_hash
linefile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_lines
datafile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_dataset
sed 's/"//g' "$evalfile" | tail -n +2 | cut -d',' -f2 > all_hash
# Bug fix: the pattern file is the literal name all_hash — the original used
# the unset variable $all_hash, leaving grep with no pattern file.
# (grep -F is the modern spelling of the deprecated fgrep.)
LC_ALL=C grep -F -f all_hash "$hashfile" | awk '{print "\t"$1}' >> all_t_identifier
#linefile每行为 "line_number(tab分隔符)identifier" — the leading tab written
# above anchors each identifier at the start of its field.
LC_ALL=C grep -F -f all_t_identifier "$linefile" | awk '{print $1+1}' >> all_lineNumber
#还是很慢 — each sed call rescans data_15.txt from the top; a single awk pass
# over data_15.txt would be far faster if output order doesn't matter.
cut -d$'\t' -f15 "$datafile" > data_15.txt
i=0                  # bug fix: the counter was never initialized
thresh=${thresh:-0}  # bug fix: resume threshold was an unset variable; default to 0
while IFS= read -r lineNumber; do
  i=$((i+1))
  if [ "$i" -gt "$thresh" ]; then
    #echo $i >> log_downloadurl
    #echo `date +%Y.%m.%d-%k:%M:%S-%N` >> log_downloadurl
    #download_url=`sed "${lineNumber}q;d" $datafile | cut -d$'\t' -f15`
    download_url=$(sed "${lineNumber}q;d" data_15.txt)
    # Bug fix: $all_downloadurl was unset, making the redirect ambiguous;
    # write to the literal file name instead.
    echo "$download_url" >> all_downloadurl
  fi
done < all_lineNumber
重点是我之前一直觉得某个输出文档的顺序都得和输入文档的顺序相同,而这里fgrep
的输出是排序后的(最后也没有找到方法解决).
但是不管怎么样现在的重点是根据input文件, 找到data文件里所有的对应视频进行下载, 现在先不用管顺序了.
code 4
#!/bin/bash
# Final variant: resolve hashes to identifiers, then join the identifiers
# against a 2-column (identifier, url) projection of the data file.
evalfile=/work/na/Trecvid/2016/MED16-InputFiles_20160701/MED16-EvalFull-YFCC100M_Subset.csv
hashfile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_hash
linefile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_lines
datafile=/work/na/Trecvid/2016/YFCC100M/download/yfcc100m_dataset
sed 's/"//g' "$evalfile" | tail -n +2 | cut -d',' -f2 > all_hash
# Bug fix: pattern file is the literal all_hash, not the unset $all_hash.
LC_ALL=C grep -F -f all_hash "$hashfile" | awk '{print "\t"$1}' >> all_t_identifier
#data_2.txt 每行为"identifier(tab分隔符)下载地址"
cut -d$'\t' -f1,15 "$datafile" > data_2.txt
# Bug fix: read the literal file all_t_identifier, not the unset variable.
# Each line is "\tidentifier"; field 2 is the bare identifier.
cut -d$'\t' -f2 all_t_identifier > all_identifier_t
# Exact whole-field join. fgrep -f does substring matching, so a wanted
# identifier 3431612105 also selected the data line for 13431612105 — that is
# where the 39 extra output lines came from. awk's hash lookup ($1 in want)
# only accepts an identical first field, and makes a single pass over data_2.txt.
awk -F'\t' 'NR==FNR{want[$1]; next} $1 in want {print $2}' \
  all_identifier_t data_2.txt >> download.txt
有个悲剧是download.txt的输出是100,039行(比预期的10万行多了39行).
因为在匹配的时候LC_ALL=C fgrep -f all_identifier_t data_2.txt | awk '{print $2}'
, 如果all_identifier_t文件里有一个3431612105
, 那么data_2.txt文件里的13431612105
和3431612105
那两行都会被选中.
用grep的话,
grep man * 会匹配 'Batman'、'manic'、'man' 等,
grep '\<man' * 匹配 'manic' 和 'man',但不匹配 'Batman',
grep '\<man\>' 只匹配 'man',而不匹配 'Batman' 或 'manic' 等其他的字符串。
'^':指匹配的字符串在行首,
'$':指匹配的字符串在行尾,
可是fgrep不支持正则表达式.
ctrl+f了一下download.txt中的"jpg", 正好39个,手动删除了…
cat line X to line Y on a huge file
http://unix.stackexchange.com/questions/47407/cat-line-x-to-line-y-on-a-huge-file
Speed test:
100,000,000-line file generated by seq 100000000 > test.in
Reading lines 50,000,000-50,000,010
Tests in no particular order
real time as reported by bash's builtin time
4.373 4.418 4.395 tail -n+50000000 test.in | head -n10
5.210 5.179 6.181 sed -n '50000000,50000010p;50000010q' test.in
5.525 5.475 5.488 head -n50000010 test.in | tail -n10
8.497 8.352 8.438 sed -n '50000000,50000010p' test.in
22.826 23.154 23.195 tail -n50000001 test.in | head -n10
25.694 25.908 27.638 ed -s test.in <<<"50000000,50000010p"
31.348 28.140 30.574 awk 'NR<50000000{next}1;NR==50000010{exit}' test.in
51.359 50.919 51.127 awk 'NR >= 50000000 && NR <= 50000010' test.in
These are by no means precise benchmarks, but the difference is clear and repeatable enough* to give a good sense of the relative speed of each of these commands.
*: Except between the first two, sed -n p;q and head|tail, which seem to be essentially the same.