Hadoop Programming Exercises

1  Common Friends

Approach: take A, whose friends are B, C, D, E and F. A is then a common friend of the pair (B, C), so the mapper emits the pair BC as the key and A as the value, and does the same for every other pair of A's friends. For example, the input line "A B C D E F" produces (BC, A), (BD, A), (BE, A), (BF, A), (CD, A), (CE, A), (CF, A), (DE, A), (DF, A) and (EF, A); the reducer then concatenates all owners that arrive under the same pair key.

import java.io.IOException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class FindFriend {

    public static class ChangeMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The first token of a line is the owner, the rest are the owner's friends.
            StringTokenizer itr = new StringTokenizer(value.toString());
            Text owner = new Text();
            Set<String> set = new TreeSet<String>();
            owner.set(itr.nextToken());
            while (itr.hasMoreTokens()) {
                set.add(itr.nextToken());
            }
            String[] friends = new String[set.size()];
            friends = set.toArray(friends);

            // Emit every pair of the owner's friends as the key;
            // the owner is a common friend of that pair.
            for (int i = 0; i < friends.length; i++) {
                for (int j = i + 1; j < friends.length; j++) {
                    String outputkey = friends[i] + friends[j];
                    context.write(new Text(outputkey), owner);
                }
            }
        }
    }

    public static class FindReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Join all common friends of this pair with ":".
            String commonfriends = "";
            for (Text val : values) {
                if (commonfriends.isEmpty()) {
                    commonfriends = val.toString();
                } else {
                    commonfriends = commonfriends + ":" + val.toString();
                }
            }
            context.write(key, new Text(commonfriends));
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("args error");
            System.exit(2);
        }
        Job job = new Job(conf, "find common friends");
        job.setJarByClass(FindFriend.class);
        job.setMapperClass(ChangeMapper.class);
        job.setCombinerClass(FindReducer.class);
        job.setReducerClass(FindReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Result:

AB      E:C:D
AC      E:B
AD      B:E
AE      C:B:D
BC      A:E
BD      A:E
BE      C:D:A
BF      A
CD      E:A:B
CE      A:B
CF      A
DE      B:A
DF      A
EF      A

2  Base Station Stay Time

Requirement: see the base-station stay-time problem below (the "Hive or custom MR" question), which gives the sample data and field descriptions.

Expected output: see the expected-output example in that same section.

Approach: load the data into a Hive table; when querying, simply order by phone number and start time.

3  Script Replacement

Script (name it anything, e.g. aaa.sh):

#!/bin/bash

ls $1 | while read line

do

sed -i 's,\$HADOOP_HOME\$,\/home\/aa,g' $1$line

echo $1$line

done

Command to run the script (replaces in all files under /home/hadoop/test/):

./aaa.sh /home/hadoop/test/

4  One-Command Remote Execution

Script:

vi runRemoteCmd.sh

#!/bin/bash

$1

ssh -q hadoop@slave1 "$1"

ssh -q hadoop@slave2 "$1"

Run it:

./runRemoteCmd.sh "ls -l"

 

 

 

Implement non-recursive binary search in Java.

public class BinarySearchClass {

    public static int binary_search(int[] array, int value) {
        int beginIndex = 0;                  // low index
        int endIndex = array.length - 1;     // high index
        int midIndex = -1;
        while (beginIndex <= endIndex) {
            midIndex = beginIndex + (endIndex - beginIndex) / 2;  // avoids overflow
            if (value == array[midIndex]) {
                return midIndex;
            } else if (value < array[midIndex]) {
                endIndex = midIndex - 1;
            } else {
                beginIndex = midIndex + 1;
            }
        }
        // Found: return the index of the value; not found: return -1.
        return -1;
    }

    // start  Note: unique marker for automated grading; do not remove or add to it.
    public static void main(String[] args) {
        System.out.println("Start...");
        int[] myArray = new int[] { 1, 2, 3, 5, 6, 7, 8, 9 };
        System.out.println("Index of the number 8:");
        System.out.println(binary_search(myArray, 8));
    }
    // end  Note: unique marker for automated grading; do not remove or add to it.
}

 

4.0  The current log sample format is:

           a , b , c , d

           b , b , f , e

           a , a , c , f        

Using the language you are most familiar with, write a MapReduce job that counts the number of occurrences of each element in the fourth column.

 

Answer (for the three sample lines above, the job should output d 1, e 1, f 1):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount1 {
      public static final String INPUT_PATH = "hdfs://hadoop0:9000/in";
      public static final String OUT_PATH = "hdfs://hadoop0:9000/out";

      public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(conf);
            // Remove the output directory if it already exists.
            if (fileSystem.exists(new Path(OUT_PATH))) {
                  fileSystem.delete(new Path(OUT_PATH), true);
            }
            Job job = new Job(conf, WordCount1.class.getSimpleName());
            job.setJarByClass(WordCount1.class);
            // 1. Read the input files and parse them into key/value pairs.
            FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
            // 2. Map: turn each input key/value pair into new key/value pairs.
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // 3. Partition the map output.
            // 4. Sort and group; values with the same key end up in one group.
            // 5. Optionally combine the grouped data.
            // 6. Copy the map output to the reduce nodes over the network (shuffle).
            // 7. Reduce: process the grouped map output.
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
            job.waitForCompletion(true);
      }

      static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
            @Override
            protected void map(LongWritable k1, Text v1, Context context)
                        throws IOException, InterruptedException {
                  // The sample log is comma separated, e.g. "a , b , c , d".
                  String[] split = v1.toString().trim().split("\\s*,\\s*");
                  if (split.length >= 4) {
                        // Emit the fourth column with a count of 1.
                        context.write(new Text(split[3]), new LongWritable(1));
                  }
            }
      }

      static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
            @Override
            protected void reduce(Text k2, Iterable<LongWritable> v2, Context context)
                        throws IOException, InterruptedException {
                  long count = 0L;
                  for (LongWritable time : v2) {
                        count += time.get();
                  }
                  context.write(k2, new LongWritable(count));
            }
      }
}

 

1  Implement the following logic with Hive or a custom MR job

product_no     lac_id  moment  start_time     user_id county_id       staytime       city_id

13429100031     22554   8      2013-03-11 08:55:19.151754088   571    571     282     571

13429100082     22540   8      2013-03-11 08:58:20.152622488   571    571     270     571

13429100082     22691   8      2013-03-11 08:56:37.149593624   571    571     103     571

13429100087     22705   8      2013-03-11 08:56:51.139539816   571    571     220     571

13429100087     22540   8      2013-03-11 08:55:45.150276800   571    571     66      571

13429100082     22540   8      2013-03-11 08:55:38.140225200   571    571     133     571

13429100140     26642   9      2013-03-11 09:02:19.151754088   571    571     18      571

13429100082     22691   8      2013-03-11 08:57:32.151754088   571    571     287     571

13429100189     22558   8      2013-03-11 08:56:24.139539816   571    571     48      571

13429100349     22503   8      2013-03-11 08:54:30.152622440   571    571     211     571

Field descriptions:

product_no: the user's phone number;

lac_id: the base station the user is on;

start_time: the time at which the user entered this base station;

staytime: how long the user stayed at this base station.

 

Requirement:

lac_id and start_time tell us where the user was at a given moment, and staytime tells us how long the user stayed at each base station. Merge the staytime of consecutive records for the same base station along the user's trajectory, so that the final result gives, for each user, the stay time at each base station, ordered by time.

 

Example of the expected output:

13429100082     22540   8      2013-03-11 08:58:20.152622488   571    571     270     571

13429100082     22691   8      2013-03-11 08:56:37.149593624   571    571     390     571

13429100082     22540   8      2013-03-11 08:55:38.140225200   571    571     133     571

13429100087     22705   8      2013-03-11 08:56:51.139539816   571    571     220     571

13429100087     22540   8      2013-03-11 08:55:45.150276800   571    571     66      571

Answer:

package org.aboutyun;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

public class TimeCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = new Job(conf, "time_count");
        job.setJarByClass(TimeCount.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private Text id = new Text();
        private Text row = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] items = line.split("\t");

            if (items.length == 8) {
                if (StringUtils.isNumeric(items[6])) {
                    // Group by (product_no, lac_id) and pass the whole row through.
                    id.set(items[0] + "\t" + items[1]);
                    row.set(line);
                    context.write(id, row);
                }
            } else {
                System.out.println("Wrong length: " + items.length);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        static {
            format.setLenient(false);
        }

        private Text rest = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            //  Parse each row into a Record
            ArrayList<Record> list = new ArrayList<Record>();
            for (Text row : values) {
                String[] items = row.toString().split("\t");
                try {
                    Record record = new Record();
                    record.items = items;
                    record.start_time = format.parse(items[3]).getTime();
                    record.stay_time = Long.parseLong(items[6]) * 1000;
                    list.add(record);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }

            //  Sort by start time
            Collections.sort(list, new Comparator<Record>() {
                @Override
                public int compare(Record r1, Record r2) {
                    return Long.compare(r1.start_time, r2.start_time);
                }
            });

            //  Find and merge overlapping slices
            ArrayList<Record> result = new ArrayList<Record>();
            for (Record r1 : list) {
                boolean found = false;
                long r1_stop_time = r1.start_time + r1.stay_time;
                for (Record r2 : result) {
                    long r2_stop_time = r2.start_time + r2.stay_time;
                    if (r1.start_time > r2.start_time && r1.start_time <= r2_stop_time
                            && r1_stop_time > r2_stop_time) {
                        //  merge the new slice by extending the existing one
                        r2.stay_time = r1_stop_time - r2.start_time;
                        found = true;
                    }
                }

                if (!found) {
                    result.add(r1);
                }
            }

            //  Output: the original columns, with the merged stay time in place of staytime
            for (Record r : result) {
                key.set(r.items[0]);

                String value = r.items[1] + "\t"
                        + r.items[2] + "\t"
                        + r.items[3] + "\t"
                        + r.items[4] + "\t"
                        + r.items[5] + "\t"
                        + (r.stay_time / 1000) + "\t"
                        + r.items[7];
                rest.set(value);

                context.write(key, rest);
            }
        }

        static class Record {
            String[] items;
            long start_time;
            long stay_time;
        }
    }
}

 

 

2  Linux scripting

2.1 Using any scripting language you like, replace $HADOOP_HOME$ with /home/ocetl/app/hadoop in every file under a specified directory.

2.2 Assume there are 10 hosts, H1 to H10, with passwordless SSH (mutual trust) set up between them. Write one or more scripts that run a given command on all of the remote hosts.

For example: runRemoteCmd.sh "ls -l"

Expected result:

H1:

XXXXXXXX

XXXXXXXX

XXXXXXXX

H2:

XXXXXXXX

XXXXXXXX

XXXXXXXX

H3:

...

Answer

2.1 Use find + sed:

find /home/ocetl/app/hadoop -type f -exec sed -i 's/\$HADOOP_HOME\$/\/home\/ocetl\/app\/hadoop/g' {} \;

2.2 Simply loop over the hosts and run the command through ssh:

#!/bin/bash
if [ $# -ne 1 ]
then
        echo "Usage: `basename $0` {command}"
        exit
fi

for i in H1 H2 H3 H4 H5 H6 H7 H8 H9 H10
do
        echo "$i:"
        ssh $i "$1"
done







3  Hadoop fundamentals and problem analysis

3.1 Describe the places in Hadoop where caching mechanisms are used, and what each of them is for.

3.2 Describe what problem https://issues.apache.org/jira/browse/HDFS-2379 reports, and the idea behind the final fix.

3.1 Not familiar with all of them; HDFS uses caching.

3.2 The problem is that when the disks are large but the OS has little page cache, generating the DataNode's block report takes a long time, and the FSVolumeSet lock is held for the whole duration, so no read or write operations can proceed and they eventually time out. The issue proposes a way to generate the block report without holding the FSVolumeSet lock, so those operations no longer fail.

4  MapReduce development

Following the wordcount example, implement a MapReduce job of your own, with the following requirements:

    a Input file format:

      xxx,xxx,xxx,xxx,xxx,xxx,xxx

    b Output file format:

      xxx,20

      xxx,30

      xxx,40

    c Function: count how many times each keyword given on the command line appears in the input file, and display the counts.

      For example: hadoop jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx (four keywords)

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class WordCount {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private final ArrayList<String> target_words = new ArrayList<String>();

        @Override
        protected void setup(Context context) {
            // Read the keyword list from the job configuration, so it is also
            // available to map tasks running on other nodes.
            for (String w : context.getConfiguration().get("wordcount.keywords", "").split(",")) {
                if (!w.isEmpty()) {
                    target_words.add(w.toLowerCase());
                }
            }
        }

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Treat punctuation (including the comma separators) as token boundaries.
            String[] items = value.toString().toLowerCase()
                    .replaceAll("\\p{Punct}", " ").split("\\s+");
            for (String item : items) {
                if (target_words.contains(item)) {
                    word.set(item);
                    context.write(word, one);
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        if (args.length < 3) {
            System.out.println("Usage: wordcount <input_path> <output_path> <keyword_list>");
            return;
        }

        //  Pass the comma-separated keyword list to the mappers via the configuration.
        conf.set("wordcount.keywords", args[2]);

        Job job = new Job(conf, "wordcount");
        job.setJarByClass(WordCount.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

}

5  MapReduce optimization

Based on the program in the previous question, suggest ways to speed up the MR job.
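One possible direction, sketched below as an illustration rather than as part of the original answer: combine partial counts on the map side, compress the intermediate map output, and run more than one reducer. WordCountTuning is a hypothetical helper class, the reducer count of 4 is an arbitrary example, and the compression property name is the one used by Hadoop 2.x.

import org.apache.hadoop.mapreduce.Job;

public class WordCountTuning {
    //  Apply some common speed-ups to the keyword-count job from the previous question.
    public static void tune(Job job) {
        //  Sum partial counts on the map side; the sum in Reduce is associative,
        //  so the reducer class can double as the combiner.
        job.setCombinerClass(WordCount.Reduce.class);
        //  Compress the intermediate map output to cut shuffle traffic
        //  (Hadoop 2.x property name; adjust for the cluster's version).
        job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
        //  Run the reduce phase in parallel instead of with a single reducer.
        job.setNumReduceTasks(4);
    }
}

In WordCount.main(), tune(job) would be called right after the Job is created and before waitForCompletion(true).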

 

6  Linux operating system knowledge

List the configuration files under /etc that you have modified, and explain what problem each change solved.

hosts: add hostname-to-IP mappings for the LAN so you don't have to remember the IPs;
hostname: change the host name, often needed after cloning a virtual machine;
fstab: change mount points, needed when adding a new disk;
profile, bash.bashrc: frequently edited to change system-wide environment variables;
network/interfaces: needed when configuring a static IP.

7  Java development

7.1 Write code that counts the total number of lines in a 1 GB text file whose line separator is the two-character sequence \x01\x02; take care to handle the boundary cases.

7.2 Describe how you would analyze the performance of the program above during development, and how you would go about optimizing it.

package org.aboutyun;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

public class LineCounter {
    public static void main(String[] args) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(args[0]));
            char[] buffer = new char[4096];
            int count;
            char last = 0;
            long line_count = 0;
            while ((count = reader.read(buffer)) >= 0) {
                if (count > 0 && line_count == 0) {
                    //  has something in file, so at least 1 line.
                    line_count = 1;
                }

                for (int i = 0; i < count; ++i) {
                    if (buffer[i] == 0x02) {
                        //  the separator may be split across two reads, so for the
                        //  first char of the buffer check the last char of the
                        //  previous read instead of buffer[i - 1].
                        char prev = (i == 0) ? last : buffer[i - 1];
                        if (prev == 0x01) {
                            ++line_count;
                        }
                    }
                }

                //  keep the last char for the next iteration
                if (count > 0) {
                    last = buffer[count - 1];
                }
            }
            reader.close();

            System.out.println(line_count);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
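For 7.2, a reasonable first step is to find out whether the program is I/O bound before touching the counting loop. The probe below is my own minimal, self-contained sketch, not part of the original answer: it only streams through the file with a few different buffer sizes and prints the elapsed time, which separates the cost of raw reading from the cost of the per-character scan.

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

public class ReadThroughputProbe {
    public static void main(String[] args) throws IOException {
        //  Time a plain sequential read of the file at several buffer sizes.
        for (int size : new int[] { 4 * 1024, 64 * 1024, 1024 * 1024 }) {
            char[] buffer = new char[size];
            long chars = 0;
            long start = System.nanoTime();
            try (Reader reader = new FileReader(args[0])) {
                int n;
                while ((n = reader.read(buffer)) >= 0) {
                    chars += n;
                }
            }
            double seconds = (System.nanoTime() - start) / 1e9;
            System.out.printf("buffer=%dKB  chars=%d  time=%.2fs%n", size / 1024, chars, seconds);
        }
    }
}

Comparing these timings with a full LineCounter run on the same file shows where the time goes: if reading dominates, a larger buffer or reading raw bytes (the separators are single bytes, so character decoding is not strictly needed) is the place to optimize; if the scan dominates, the inner loop is.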
