去接头后的 FASTQ 文件需要转换为去重(unique)的 FASTA 格式,用于 miRDP 的下一步分析。用 Python 的 .count() 逐条统计速度太慢,改用 awk 配合 sort 和 uniq -c 速度很快。输出的序列名格式为 >t序号_x出现次数。
# Preview the first 12 lines of the trimmed FASTQ (three 4-line records, shown below).
head -n 12 SRR7406454_trimmed.fq
@SRR7406454.1 HISEQ:279:HVMFNBCXX:1:1101:1442:2039 length=50
NTTGGATTGAAGGGAGCTCTA
+SRR7406454.1 HISEQ:279:HVMFNBCXX:1:1101:1442:2039 length=50
#<D@DHII@EHC1EHE<<<D<
@SRR7406454.2 HISEQ:279:HVMFNBCXX:1:1101:1316:2109 length=50
NATTCGCTACTGTAAACCCGGTGC
+SRR7406454.2 HISEQ:279:HVMFNBCXX:1:1101:1316:2109 length=50
#<DDDHIIIIIIIIIIIIIIIIII
@SRR7406454.3 HISEQ:279:HVMFNBCXX:1:1101:1386:2141 length=50
TGCACCTAAGTAATCTAATCGGCT
+SRR7406454.3 HISEQ:279:HVMFNBCXX:1:1101:1386:2141 length=50
DDDDDIHHHHIHHIIIIIIIIIHI
# Collapse a FASTQ file into a FASTA file with one record per distinct sequence.
#
# Arguments:
#   $1 - input FASTQ (4 lines per record; the sequence is line 2 of each record)
#   $2 - output FASTA (overwritten)
# Outputs:
#   For every distinct non-empty sequence, in bytewise sorted order:
#     >t<N>_x<COUNT>
#     <SEQUENCE>
#   where N is the 1-based position in the sorted list and COUNT is the number
#   of reads carrying that sequence.
# Returns:
#   Exit status of the final awk stage.
fastq_to_unique_fasta() {
  local in_fq=$1 out_fa=$2

  # Zero-length reads (possible after adapter trimming) are dropped up front:
  # `uniq -c` would print them as a count with no second field, which the
  # formatter below would turn into a bogus record.
  # LC_ALL=C keeps the sort order, and hence the tN numbering, independent of
  # the caller's locale.
  awk 'NR % 4 == 2 && length($0) > 0' "$in_fq" \
    | LC_ALL=C sort \
    | LC_ALL=C uniq -c \
    | awk '{ print ">t" NR "_x" $1 "\n" $2 }' > "$out_fa"
}

# Run in the background, as these files are large.
fastq_to_unique_fasta SRR7406454_trimmed.fq SRR7406454_trimmed.fa &
>t1_x4
AAAAAAAAAAAAAAAAAA
>t2_x5
AAAAAAAAAAAAAAAAAAA
>t3_x2
AAAAAAAAAAAAAAAAAAAA