1 Common friends
Approach: take user A, whose friends are B, C, D, E and F. A is then a common friend of the pair B and C, so on the map side emit the pair BC as the key and A as the value. Loop over every other pair of A's friends in the same way, and do the same for every other user.
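To make the map-side output concrete, suppose one input line is exactly the example above: owner A followed by A's friends B C D E F. The mapper emits every pair of A's friends as a key, with A as the value:

A B C D E F
  -> BC A, BD A, BE A, BF A, CD A, CE A, CF A, DE A, DF A, EF A

On the reduce side all values that arrive for the same pair key are concatenated with ":", giving that pair's list of common friends.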
import java.io.IOException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class FindFriend {

    public static class ChangeMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Input line: an owner followed by that owner's friends, whitespace separated.
            StringTokenizer itr = new StringTokenizer(value.toString());
            Text owner = new Text();
            Set<String> set = new TreeSet<String>();
            owner.set(itr.nextToken());
            while (itr.hasMoreTokens()) {
                set.add(itr.nextToken());
            }
            String[] friends = new String[set.size()];
            friends = set.toArray(friends);

            // Every pair of the owner's friends has the owner as a common friend.
            for (int i = 0; i < friends.length; i++) {
                for (int j = i + 1; j < friends.length; j++) {
                    String outputkey = friends[i] + friends[j];
                    context.write(new Text(outputkey), owner);
                }
            }
        }
    }

    public static class FindReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            // Concatenate all common friends of this pair with ":".
            String commonfriends = "";
            for (Text val : values) {
                if (commonfriends.isEmpty()) {
                    commonfriends = val.toString();
                } else {
                    commonfriends = commonfriends + ":" + val.toString();
                }
            }
            context.write(key, new Text(commonfriends));
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("args error");
            System.exit(2);
        }
        Job job = new Job(conf, "find common friends");
        job.setJarByClass(FindFriend.class);
        job.setMapperClass(ChangeMapper.class);
        job.setCombinerClass(FindReducer.class);
        job.setReducerClass(FindReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Result (each line is a pair of users followed by their common friends; for example, the first line says that A and B's common friends are E, C and D):
AB E:C:D
AC E:B
AD B:E
AE C:B:D
BC A:E
BD A:E
BE C:D:A
BF A
CD E:A:B
CE A:B
CF A
DE B:A
DF A
EF A
2 Base station dwell time
Requirement:
Expected output:
Approach:
Load the data into a Hive table; when querying, just sort by phone number and time.
3 Batch replacement with a script
Script (name it anything, for example aaa.sh):
#!/bin/bash
ls $1 | while read line
do
    sed -i 's,\$HADOOP_HOME\$,/home/aa,g' $1$line
    echo $1$line
done
To run the script and replace in every file under /home/hadoop/test/:
./aaa.sh /home/hadoop/test/
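For instance (a hypothetical file content, only to show what the sed expression does), a line such as

export PATH=$HADOOP_HOME$/bin:$PATH

becomes

export PATH=/home/aa/bin:$PATH

after the script runs. A comma is used as the sed delimiter so the slashes in the replacement path do not clash with the usual s/// syntax.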
4 Run a command on all nodes at once
Script:
vi runRemoteCmd.sh
#!/bin/bash
$1
ssh -q hadoop@slave1 "$1"
ssh -q hadoop@slave2 "$1"
To run it:
./runRemoteCmd.sh "ls -l"
Implement a non-recursive binary search in Java.
public class BinarySearchClass
{
    public static int binary_search(int[] array, int value)
    {
        int beginIndex = 0;              // low index
        int endIndex = array.length - 1; // high index
        int midIndex = -1;
        while (beginIndex <= endIndex) {
            midIndex = beginIndex + (endIndex - beginIndex) / 2; // avoids overflow
            if (value == array[midIndex]) {
                return midIndex;
            } else if (value < array[midIndex]) {
                endIndex = midIndex - 1;
            } else {
                beginIndex = midIndex + 1;
            }
        }
        // Found: the index was returned above; not found: return -1.
        return -1;
    }

    // start  Note: unique marker used by the automatic grader; do not remove or add to it.
    public static void main(String[] args)
    {
        System.out.println("Start...");
        int[] myArray = new int[] { 1, 2, 3, 5, 6, 7, 8, 9 };
        System.out.println("Index of the number 8:");
        System.out.println(binary_search(myArray, 8));
    }
    // end  Note: unique marker used by the automatic grader; do not remove or add to it.
}
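A quick illustration of why the midpoint is computed as beginIndex + (endIndex - beginIndex) / 2 rather than (beginIndex + endIndex) / 2 (the index values below are made up, just large enough to trigger the problem):

int lo = 2000000000;
int hi = 2100000000;
int bad  = (lo + hi) / 2;      // lo + hi exceeds Integer.MAX_VALUE and wraps to a negative number
int good = lo + (hi - lo) / 2; // stays in range: 2050000000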
4.0 The current log sample format is:
a , b , c , d
b , b , f , e
a , a , c , f
Using the language you are most familiar with, write a MapReduce job that counts how many times each value in the fourth column occurs.
Answer:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount1 {
    public static final String INPUT_PATH = "hdfs://hadoop0:9000/in";
    public static final String OUT_PATH = "hdfs://hadoop0:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        // Remove the output directory if it already exists.
        if (fileSystem.exists(new Path(OUT_PATH))) {
            fileSystem.delete(new Path(OUT_PATH), true);
        }
        Job job = new Job(conf, WordCount1.class.getSimpleName());
        job.setJarByClass(WordCount1.class);
        // 1. Read the input files and turn them into key/value pairs.
        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        // 2. Map: turn each input key/value pair into new key/value pairs.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3. Partition the map output.
        // 4. Sort and group within each partition; values with the same key end up in one group.
        // 5. (Optionally) combine the grouped data locally.
        // 6. Shuffle: copy the map output to the reduce nodes over the network.
        // 7. Reduce: process the grouped data.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            // Lines look like "a , b , c , d"; split on the comma and its surrounding spaces.
            String[] split = v1.toString().split("\\s*,\\s*");
            if (split.length >= 4) {
                // Emit the fourth column with a count of 1.
                context.write(new Text(split[3]), new LongWritable(1L));
            }
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable time : v2) {
                count += time.get();
            }
            context.write(k2, new LongWritable(count));
        }
    }
}
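For the three sample lines above, the fourth column contains d, e and f, once each, so the job's output would be:
d    1
e    1
f    1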
1 Implement the following logic with Hive or a custom MR job
product_no lac_id moment start_time user_id county_id staytime city_id
13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 571
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 571
13429100082 22691 8 2013-03-11 08:57:32.151754088 571 571 287 571
13429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 571
13429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571
Field descriptions:
product_no: the user's phone number;
lac_id: the base station the user is on;
start_time: the time at which the user entered this base station's coverage;
staytime: how long the user stayed at this base station.
Requirement:
lac_id and start_time tell you where the user was at a given moment, and staytime tells you how long the user dwelled at each base station. Following the user's trajectory, merge the staytime of consecutive records at the same base station.
The final result is, for each user, the dwell time at each base station, ordered by time.
Example of the expected output:
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
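A worked reading of the sample: user 13429100082 has two consecutive records at station 22691 (08:56:37 for 103 s and 08:57:32 for 287 s), and in the expected output they have been merged into a single record whose staytime is 103 + 287 = 390 s. Note that the answer below merges by extending the earlier record to the later record's end time, so for records whose intervals overlap the merged value can differ from a plain sum.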
Answer
package org.aboutyun;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

public class TimeCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "time_count");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private Text id = new Text();
        private Text row = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] items = line.split("\t");
            if (items.length == 8) {
                // Skip the header row: staytime must be numeric.
                if (StringUtils.isNumeric(items[6])) {
                    // Group by (phone number, base station id).
                    id.set(items[0] + "\t" + items[1]);
                    row.set(line);
                    context.write(id, row);
                }
            } else {
                System.out.println("Wrong length: " + items.length);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        static {
            format.setLenient(false);
        }

        private Text rest = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Parse each row into a Record.
            ArrayList<Record> list = new ArrayList<Record>();
            for (Text row : values) {
                String[] items = row.toString().split("\t");
                try {
                    Record record = new Record();
                    record.items = items;
                    record.start_time = format.parse(items[3]).getTime();
                    record.stay_time = Long.parseLong(items[6]) * 1000;
                    list.add(record);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }

            // Sort by start time.
            Collections.sort(list, new Comparator<Record>() {
                @Override
                public int compare(Record r1, Record r2) {
                    return r1.start_time < r2.start_time ? -1
                            : (r1.start_time > r2.start_time ? 1 : 0);
                }
            });

            // Find and merge overlapping slices.
            ArrayList<Record> result = new ArrayList<Record>();
            for (Record r1 : list) {
                boolean found = false;
                long r1_stop_time = r1.start_time + r1.stay_time;
                for (Record r2 : result) {
                    long r2_stop_time = r2.start_time + r2.stay_time;
                    if (r1.start_time > r2.start_time && r1.start_time <= r2_stop_time && r1_stop_time > r2_stop_time) {
                        // Extend the earlier record to cover the new slice.
                        r2.stay_time = r1_stop_time - r2.start_time;
                        found = true;
                    }
                }
                if (!found) {
                    result.add(r1);
                }
            }

            // Output one merged record per (phone, station) slice.
            for (Record r : result) {
                key.set(r.items[0]);
                String value = r.items[1] + "\t"
                        + r.items[2] + "\t"
                        + r.items[3] + "\t"
                        + r.items[4] + "\t"
                        + r.items[5] + "\t"
                        + (r.stay_time / 1000) + "\t"
                        + r.items[7];
                rest.set(value);
                context.write(key, rest);
            }
        }

        static class Record {
            String[] items;
            long start_time;
            long stay_time;
        }
    }
}
2 Linux scripting skills
2.1 In any scripting language you like, batch-replace $HADOOP_HOME$ with /home/ocetl/app/hadoop in every file under a specified directory.
2.2 Assume there are 10 hosts, H1 through H10, with passwordless SSH set up between them. Write one or more scripts that run a given command on all of the remote hosts.
For example: runRemoteCmd.sh "ls -l"
Expected result:
H1:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H2:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H3:
...
Answer
2.1 Use find together with sed:
find /home/ocetl/app/hadoop -type f -exec sed -i 's/\$HADOOP_HOME\$/\/home\/ocetl\/app\/hadoop/g' {} \;
2.2 Pass the command directly as an argument to ssh:
#!/bin/bash
if [ $# -ne 1 ]
then
    echo "Usage: `basename $0` {command}"
    exit
fi

for i in H1 H2 H3 H4 H5 H6 H7 H8 H9 H10
do
    echo "$i:"
    ssh $i "$1"
done
3 Hadoop fundamentals and problem-analysis skills
3.1 Describe where caching is used in Hadoop and what each cache is for.
3.2 Describe the problem reported in https://issues.apache.org/jira/browse/HDFS-2379 and the idea behind the eventual fix.
3.1 Not familiar with the details; HDFS does make use of caching.
3.2 The problem: when the disks are large but little memory is available for the page cache, generating the DataNode's block report takes a long time, and the FSVolumeSet lock is held for the whole duration, so all read and write operations are blocked and eventually time out. The issue proposes a way to generate the block report without holding the FSVolumeSet lock, so those operations no longer fail.
4 MapReduce development skills
Using wordcount as a reference, implement your own MapReduce job with the following requirements:
a Input file format:
xxx,xxx,xxx,xxx,xxx,xxx,xxx
b Output file format:
xxx,20
xxx,30
xxx,40
c Function: count how many times each keyword given on the command line appears in the input file, and print the result
For example: hadoop jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx (four keywords)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class WordCount {
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private final ArrayList<String> target_words = new ArrayList<String>();

        @Override
        protected void setup(Context context) {
            // Read the keyword list from the job configuration so that it is available
            // in every mapper JVM, not only in the client that submitted the job
            // (a static list filled in main() would not survive the job submission).
            for (String w : context.getConfiguration().get("keyword.list", "").split(",")) {
                if (!w.isEmpty()) {
                    target_words.add(w.toLowerCase());
                }
            }
        }

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input format is comma separated; lower-case and split on commas/whitespace.
            String[] items = value.toString().toLowerCase().split("[,\\s]+");
            for (String item : items) {
                if (target_words.contains(item)) {
                    word.set(item);
                    context.write(word, one);
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length < 3) {
            System.out.println("Usage: wordcount <input_path> <output_path> <keyword_list>");
            return;
        }

        // Hand the comma-separated keyword list to the mappers via the configuration.
        conf.set("keyword.list", args[2]);

        Job job = new Job(conf, "wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
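A possible invocation matching the three arguments the program expects (the jar name, paths and keywords are only placeholders):

hadoop jar wordcount.jar WordCount /user/hadoop/input /user/hadoop/output apple,banana,orange,grape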
5 MapReduce optimization
Based on the program in the previous question, suggest ways to make the MR job run faster.
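One common first step for a counting job like the one above (a sketch, not part of the original answer): add a combiner so that per-keyword counts are partially summed on the map side and far less data has to cross the network during the shuffle. Because the reduce logic is a plain sum, the existing Reduce class can be reused directly:

job.setCombinerClass(Reduce.class);  // partial sums on the map side; valid because addition is associative and commutative

Other usual levers include compressing the map output and tuning the number of reduce tasks to the cluster size.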
6 Linux operating system knowledge
List the configuration files under /etc that you have modified, and explain what problem each change was meant to solve.
hosts: add hostname-to-IP mappings for machines on the LAN, so the IPs don't have to be remembered;
hostname: change the host name, often needed after cloning a virtual machine;
fstab: change mount points, needed when adding a new disk;
profile, bash.bashrc: commonly edited to change system-wide environment variables;
network/interfaces: needed when configuring a static IP.
7 Java development skills
7.1 Write code that counts the total number of lines in a 1 GB text file whose line separator is \x01\x02; take care to handle the edge cases.
7.2 Describe how you would analyse the performance of the program above during development and how you would go about optimizing it.
package org.aboutyun;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

public class LineCounter {
    public static void main(String[] args) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(args[0]));
            char[] buffer = new char[4096];
            int count;
            char last = 0;
            long line_count = 0;
            while ((count = reader.read(buffer)) > 0) {
                if (line_count == 0) {
                    // The file contains something, so there is at least one line.
                    line_count = 1;
                }

                for (int i = 0; i < count; ++i) {
                    char c = buffer[i];
                    if (c == 0x02) {
                        if (i == 0 && last == 0x01) {
                            // The buffer boundary split the 0x01,0x02 pair.
                            ++line_count;
                        } else if (i > 0 && buffer[i - 1] == 0x01) {
                            // Normal case: the pair sits inside one buffer.
                            ++line_count;
                        }
                    }
                }

                // Remember the last character for the next buffer.
                last = buffer[count - 1];
            }
            reader.close();

            System.out.println(line_count);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}