3000道POJ英文题中高频词汇--HADOOP单词统计

本文介绍了使用Hadoop MapReduce进行POJ英文题目的单词统计,通过对1000到4000题目的数据清洗,去除数字、中文和特殊符号,得出3000道题中约800个不同单词的统计结果,展示了一种数据处理的方法。

最近学习HADOOP,写mapreduce,最简单的单词统计。

以前刷题,英语不好,很烦。

现在统计一下poj上单词。

首先,抓取了1001--4000的英文题目(与代码中循环范围一致),对数据进行清洗,把不需要的数字、中文和各种奇怪的符号都去掉。

然后直接跑mapreduce

统计出这3000道题才有不到800个不同的单词,先看一下结果。

each	19
are	21
by	21
input	23
that	24
line	26
be	33
will	33
The	41
number	42
and	58
is	60
in	61
to	63
a	65
of	129
the	226


统计结果还是可以被参考的。

爬虫(不太会,low)

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import org.junit.Test;

public class Pc {

	/**
	 * Downloads the page at the given URL and returns its HTML as one string
	 * (lines joined with '\n'). Returns null if the request or the read
	 * fails; the error is printed to stderr.
	 */
	public static String getHtml(String urlString) {
		try {
			StringBuffer html = new StringBuffer();
			URL url = new URL(urlString);
			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			// Fix: read with an explicit charset instead of the platform
			// default, and use try-with-resources so the streams are closed
			// even when readLine() throws (the original only closed them on
			// the success path).
			try (InputStreamReader isr = new InputStreamReader(conn.getInputStream(), "UTF-8");
					BufferedReader br = new BufferedReader(isr)) {
				String temp;
				while ((temp = br.readLine()) != null) {
					html.append(temp).append("\n");
				}
			}
			return html.toString();
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Appends the given text to d:/poj11.txt as UTF-8 bytes.
	 *
	 * Fixes: the original leaked the FileOutputStream (it was never closed,
	 * and this method is called once per scraped problem), and carried an
	 * invalid JUnit @Test annotation — @Test is illegal on a static method
	 * that takes a parameter and breaks test discovery.
	 */
	public static void xiewenjian(String str) throws Exception {
		byte[] b = str.getBytes("UTF-8");
		try (FileOutputStream out = new FileOutputStream("d:/poj11.txt", true)) {
			out.write(b);
		}
	}

	/**
	 * Extracts every problem statement ({@code <div class="ptx" ...>}) from
	 * the page at {@code url}, strips tags and non-letter characters, and
	 * appends the remaining words to the output file.
	 *
	 * Fixes over the original:
	 * - the page is downloaded once and scanned in a loop; the original
	 *   recursed and re-downloaded the whole page for every occurrence
	 * - a failed download (getHtml returning null) is skipped instead of
	 *   throwing NullPointerException
	 * - tags are removed with "<[^>]+>" instead of the greedy "<.+>", which
	 *   deleted all text between the first and last tag on a line
	 * - spaces are collapsed after (not before) punctuation removal, so the
	 *   output never contains runs of blanks
	 *
	 * @param url page to scrape
	 * @param k   index at which to start scanning (callers pass 0)
	 */
	public void zhuaqu(String url, int k) throws Exception {
		String s = getHtml(url);
		if (s == null) {
			return; // download failed; skip this problem
		}
		String head = "<div class=\"ptx\" lang=\"en-US\">";
		String tail = "</div>";
		int h1 = s.indexOf(head, k);
		while (h1 != -1) {
			int x = h1 + head.length();
			int h2 = s.indexOf(tail, x);
			if (h2 == -1) {
				return; // malformed page: opening div without a closing tag
			}
			String str = s.substring(x, h2);
			String cleaned = str.replaceAll("\\.", " ")
					.replaceAll("<[^>]+>", " ")
					.replaceAll("[^ a-zA-Z]", "")
					.replaceAll(" +", " ");
			xiewenjian(" " + cleaned);
			h1 = s.indexOf(head, h2);
		}
	}

	/**
	 * Scrapes POJ problems 1001..4000 and appends the cleaned statement
	 * words to the output file.
	 */
	public static void main(String[] args) throws Exception {
		Pc p = new Pc();
		String url0 = "http://poj.org/problem?id=";
		for (int i = 1001; i <= 4000; i++) {
			String url = url0 + i;
			System.out.println(url);
			p.zhuaqu(url, 0);
		}
	}

}


统计

package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Count {

	/**
	 * Word-count driver: reads text from args[0] and writes
	 * "word&lt;TAB&gt;count" pairs to args[1].
	 */
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Count.class);
		job.setMapperClass(SMap.class);
		job.setReducerClass(SReduce.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Fix: the original discarded the job result; propagate it as the
		// process exit code so failures are visible to callers/schedulers.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/** Emits (word, 1) for every non-empty space-separated token. */
	public static class SMap extends Mapper<LongWritable, Text, Text, IntWritable> {

		private final Text k = new Text();
		// Fix: reuse a single IntWritable instead of allocating one per
		// token; also removed the original's unused Infbean field, which
		// coupled this job to the sort job's key type for no reason.
		private final IntWritable one = new IntWritable(1);

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			for (String word : line.split(" ")) {
				if (!word.isEmpty()) {
					k.set(word);
					context.write(k, one);
				}
			}
		}
	}

	/** Sums the 1s emitted for each word and writes (word, total). */
	public static class SReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable v : values) {
				sum += v.get();
			}
			context.write(key, new IntWritable(sum));
		}
	}
}


排序
package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class Sort {

	/**
	 * Sort driver: reads the word-count output ("word&lt;TAB&gt;count") from
	 * args[0] and writes the pairs ordered by ascending count to args[1].
	 * Ordering is defined by Infbean.compareTo.
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Sort.class);
		job.setMapperClass(SMap.class);
		job.setReducerClass(SReduce.class);

		job.setMapOutputKeyClass(Infbean.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Infbean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Fix: the original discarded the job result; propagate it as the
		// process exit code so failures are visible to callers/schedulers.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/**
	 * Wraps each "word&lt;TAB&gt;count" line in an Infbean key so the
	 * shuffle sorts by count. The bean is reused: Hadoop serializes the key
	 * during write(), so mutating it afterwards is safe.
	 */
	public static class SMap extends Mapper<LongWritable, Text, Infbean, NullWritable> {

		private final Infbean v = new Infbean();

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Infbean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			// Fix: skip blank/malformed lines instead of failing the task
			// with ArrayIndexOutOfBoundsException / NumberFormatException.
			if (line.isEmpty()) {
				return;
			}
			String[] str = line.split("\t");
			if (str.length < 2) {
				return;
			}
			v.set(str[0], Integer.parseInt(str[1]));
			context.write(v, NullWritable.get());
		}
	}

	/** Writes the keys back out in sorted order. */
	public static class SReduce extends Reducer<Infbean, NullWritable, Infbean, NullWritable> {

		@Override
		protected void reduce(Infbean key, Iterable<NullWritable> values,
				Reducer<Infbean, NullWritable, Infbean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			// Fix: emit the key once per value so duplicate (word, count)
			// records would survive grouping; the original wrote each group
			// exactly once regardless of how many records it contained.
			for (NullWritable ignored : values) {
				context.write(key, NullWritable.get());
			}
		}
	}
}

自定义类型

package cn.ky.mapreduce.sortwc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Hadoop key type pairing a word with its occurrence count.
 * Natural order is ascending count, ties broken alphabetically by word.
 */
public class Infbean implements WritableComparable<Infbean> {

	private String word;
	private int count;

	/** Sets both fields; used by mappers that reuse a single bean. */
	public void set(String word, int count) {
		this.word = word;
		this.count = count;
	}

	/** Deserializes the bean; field order must mirror write(). */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.word = in.readUTF();
		this.count = in.readInt();
	}

	/** Serializes the bean; field order must mirror readFields(). */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(word);
		out.writeInt(count);
	}

	/**
	 * Orders by ascending count, breaking ties alphabetically by word.
	 *
	 * Fix: the original never returned 0 and returned -1 for both "less
	 * than" and "equal", violating the Comparable contract
	 * (sgn(a.compareTo(b)) must equal -sgn(b.compareTo(a))) — TimSort-based
	 * sorts can throw "Comparison method violates its general contract!".
	 * The word tie-break keeps distinct words in distinct reduce groups,
	 * preserving the original's intent of never merging different beans.
	 */
	@Override
	public int compareTo(Infbean o) {
		int c = Integer.compare(this.count, o.count);
		return c != 0 ? c : this.word.compareTo(o.word);
	}

	@Override
	public String toString() {
		return this.word + "\t" + this.count;
	}

	/** Consistent with compareTo: equal iff word and count both match. */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof Infbean)) {
			return false;
		}
		Infbean other = (Infbean) obj;
		return count == other.count
				&& (word == null ? other.word == null : word.equals(other.word));
	}

	@Override
	public int hashCode() {
		return (word == null ? 0 : word.hashCode()) * 31 + count;
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}

}


最后全部统计结果
zip	1
zotz	1
writing	1
zac	1
yoxkin	1
AACEDGG	1
AT	1
An	1
yax	1
xul	1
x	1
cumhu	1
court	1
work	1
world	1
currently	1
wont	1
curve	1
Arrange	1
Another	1
counting	1
At	1
whose	1
who	1
whitespace	1
whilescanfsdsnEOF	1
whilescanfsdsn	1
while	1
counted	1
Case	1
when	1
wheather	1
whats	1
B	1
Businesses	1
water	1
Cartesian	1
wants	1
could	1
unambiguous	1
Consider	1
DAABEC	1
using	1
DNA	1
uppercase	1
upper	1
up	1
until	1
unsortedness	1
unsorted	1
unlucky	1
uayet	1
tzec	1
Dit	1
type	1
During	1
twentyfive	1
Exactly	1
F	1
trailing	1
trade	1
total	1
tonight	1
Figure	1
correspond	1
Have	1
From	1
tied	1
Further	1
through	1
Generate	1
thought	1
though	1
Ginos	1
thirteen	1
third	1
H	1
Hes	1
coordinates	1
coordinate	1
convert	1
task	1
However	1
Hut	1
Hyphens	1
th	1
I	1
tens	1
IONU	1
taxing	1
control	1
supplies	1
systems	1
sweep	1
surrounding	1
surprising	1
suppressed	1
constraints	1
starting	1
Imaging	1
Inc	1
stores	1
stopped	1
stock	1
Insignificant	1
step	1
statements	1
state	1
starts	1
debt	1
square	1
Instead	1
It	1
J	1
decided	1
sold	1
specifies	1
spaces	1
K	1
sortedwhile	1
sortedness	1
L	1
somehow	1
some	1
solve	1
solution	1
defined	1
denominations	1
signals	1
singlevalue	1
Leading	1
since	1
simple	1
concentration	1
encoding	1
outside	1
rather	1
check	1
occurrence	1
making	1
computer	1
session	1
automatically	1
run	1
cataloguing	1
boundary	1
minimum	1
civilization	1
request	1
look	1
likely	1
legal	1
Problems	1
justice	1
Some	1
back	1
sharper	1
believe	1
University	1
get	1
him	1
much	1
floating	1
Month	1
extra	1
mean	1
equal	1
seems	1
easier	1
removed	1
composed	1
records	1
alternating	1
Ruritania	1
lately	1
lot	1
knotted	1
sample	1
assume	1
punctuation	1
once	1
sabbatical	1
based	1
differences	1
occupy	1
Two	1
blank	1
alphabetical	1
holly	1
W	1
national	1
having	1
OUTPUT	1
fourth	1
rest	1
Where	1
Years	1
financial	1
match	1
few	1
Philately	1
excluding	1
phrase	1
actual	1
persons	1
endofline	1
equally	1
magnitude	1
performs	1
duplicate	1
made	1
divided	1
live	1
did	1
after	1
description	1
real	1
Rn	1
done	1
arranged	1
read	1
overhang	1
encounted	1
known	1
quality	1
kankin	1
local	1
job	1
purchasing	1
attempts	1
service	1
involving	1
Note	1
into	1
constraint	1
instance	1
THE	1
individual	1
discovered	1
including	1
NumberOfTheDay	1
immediately	1
problems	1
illustrated	1
problem	1
printed	1
second	1
hotel	1
O	1
brute	1
precision	1
her	1
necessarily	1
graduated	1
call	1
possible	1
capitalization	1
result	1
g	1
When	1
forms	1
separate	1
caused	1
eroding	1
floor	1
plural	1
finds	1
about	1
filled	1
achieve	1
field	1
P	1
fact	1
ends	1
experience	1
measured	1
examples	1
sell	1
every	1
phone	1
equivalent	1
requests	1
enough	1
added	1
comments	1
research	1
common	1
either	1
edge	1
edgedetected	1
compiling	1
mac	1
company	1
encodes	1
does	1
locations	1
discovery	1
Property	1
computed	1
allocates	1
dialed	1
Locations	1
detected	1
dont	1
lexicographical	1
remaining	1
reads	1
scanf	1
area	1
per	1
left	1
she	1
Ruritanian	1
shrinking	1
ascending	1
save	1
asks	1
exists	1
koyab	1
outputs	1
S	1
displayed	1
keypad	1
Satellite	1
purposes	1
religious	1
judged	1
orginal	1
Semicircle	1
NameOfTheDay	1
items	1
Service	1
issued	1
do	1
available	1
Sometimes	1
investigating	1
compute	1
inversion	1
Successive	1
Notice	1
Mapper	1
balances	1
Number	1
inside	1
River	1
information	1
That	1
processes	1
algorithm	1
indicated	1
Thus	1
includes	1
row	1
beexactly	1
To	1
begin	1
dial	1
believed	1
U	1
bisects	1
detection	1
born	1
rounded	1
hyphen	1
pax	1
V	1
shown	1
hoping	1
particular	1
hold	1
never	1
hired	1
past	1
highest	1
preceded	1
build	1
Mississippi	1
calculate	1
nearly	1
nd	1
nearest	1
collectors	1
grab	1
endoffile	1
calling	1
Waterloo	1
name	1
chen	1
generally	1
We	1
card	1
portfolio	1
cards	1
count	1
catalog	1
according	1
force	1
muan	1
responsible	1
Postal	1
ceh	1
YEAR	1
floatingpoint	1
column	1
centered	1
ZWQM	1
financing	1
please	1
finally	1
mol	1
files	1
except	1
figure	1
above	1
fewest	1
actually	1
fail	1
respect	1
absolute	1
entries	1
expressed	1
Pizza	1
doing	1
allocations	1
design	1
several	1
reverse	1
postage	1
group	1
greater	2
going	2
give	2
pop	2
point	2
follows	2
following	2
seven	2
respectively	2
pizza	2
message	2
mental	2
eznab	2
mem	2
erosion	2
mapping	2
sets	2
person	2
manik	2
make	2
period	2
edges	2
eb	2
duplicates	2
see	2
lost	2
due	2
dollar	2
penny	2
dialing	2
life	2
described	2
letter	2
lengths	2
pair	2
learned	2
large	2
lamat	2
know	2
other	2
kan	2
ix	2
inversions	2
old	2
intellectual	2
ok	2
series	2
indicating	2
indicates	2
included	2
nonnegative	2
sequences	2
none	2
hyphens	2
denomination	2
right	2
house	2
new	2
necessary	2
had	2
sign	2
smaller	2
list	2
software	2
muluk	2
spell	2
start	2
such	2
take	2
their	2
them	2
then	2
times	2
very	2
write	2
where	2
width	2
within	2
word	2
would	2
zeros	2
One	2
OF	2
between	2
ben	2
corresponding	2
being	2
RLE	2
Output	2
After	2
Q	2
both	2
As	2
T	2
TUTGLOP	2
G	2
consisting	2
Louisiana	2
businesses	2
They	2
considering	2
GINO	2
Your	2
Use	2
Ya	2
computation	2
compressed	2
Year	2
caban	2
bank	2
columns	2
Input	2
again	2
END	2
axis	2
average	2
allocation	2
alone	2
cib	2
C	2
D	2
cimi	2
characters	2
ahau	2
also	2
chuen	2
another	2
circle	2
chicchan	2
appear	2
cases	2
Dont	2
appears	2
Help	2
canac	2
E	3
giving	3
closing	3
twelve	3
exact	3
even	3
emotional	3
c	3
way	3
but	3
determine	3
time	3
consist	3
decimal	3
how	3
ik	3
contains	3
d	3
your	3
cycle	3
process	3
property	3
physical	3
below	3
peaks	3
may	3
No	3
akbal	3
been	3
out	3
all	3
string	3
integers	3
Since	3
these	3
balance	3
account	3
measure	3
M	3
There	3
issue	3
These	3
occur	3
professor	3
single	3
last	3
space	3
long	3
m	3
names	3
must	3
Y	3
answer	3
miles	3
still	3
month	3
current	3
used	4
like	4
want	4
test	4
least	4
called	4
they	4
Fred	4
exactly	4
imix	4
combination	4
length	4
many	4
i	4
its	4
people	4
periods	4
money	4
All	4
part	4
program	4
were	4
p	4
pixel	4
needs	4
customer	4
only	4
Larry	4
Maya	4
containing	4
X	4
consists	4
pixels	4
denoted	4
stamp	5
any	5
most	5
e	5
directory	5
letters	5
file	5
end	5
tie	5
N	5
semicircle	5
customers	5
land	5
best	5
occurs	5
he	5
can	5
RPS	5
R	5
four	6
standard	6
format	6
If	6
calendar	6
contain	6
there	6
than	6
beginning	6
Z	6
strings	6
memorable	6
images	6
set	6
In	6
digits	6
dates	6
months	6
sorted	6
Tzolkin	6
two	6
if	7
pairs	7
it	7
data	7
This	7
positive	7
You	7
followed	7
maximum	7
A	7
Haab	7
values	7
which	7
day	8
triple	8
cycles	8
sequence	8
Each	8
on	8
more	8
types	8
value	8
no	8
map	8
this	8
date	9
different	9
order	9
next	9
three	9
case	9
integer	9
was	9
lines	10
example	10
same	10
form	10
has	10
one	10
numbers	11
should	11
not	11
an	11
peak	11
n	11
stamps	11
print	11
have	11
his	12
first	12
with	13
image	13
you	13
or	13
given	13
from	14
For	14
telephone	14
year	14
days	15
for	15
at	16
output	16
as	18
each	19
are	21
by	21
input	23
that	24
line	26
be	33
will	33
The	41
number	42
and	58
is	60
in	61
to	63
a	65
of	129
the	226


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值