Before using the HDFS API, we need to add the Hadoop dependencies to our project's pom.xml file.
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.ruozedata.bigdata</groupId>
  <artifactId>ruoedata-hadoop</artifactId>
  <version>1.0</version>

  <name>ruoedata-hadoop</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <hadoop.version>2.6.0-cdh5.15.1</hadoop.version>
    <zk.version>3.4.5-cdh5.15.1</zk.version>
    <scala.version>2.11.8</scala.version>
    <curator.version>4.0.0</curator.version>
  </properties>

  <!-- Repository for downloading the CDH artifacts -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <!-- Hadoop client dependency; the version comes from the hadoop.version property above -->
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
          <!--<configuration>-->
          <!--<verbose>true</verbose>-->
          <!--<fork>true</fork>-->
          <!--<executable>C:\Program Files\Java\jdk1.8.0_131\bin\javac</executable>-->
          <!--</configuration>-->
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
We can then exercise the HDFS API from a JUnit test class in IDEA:
package com.ruozedata.bigdata.hadoop.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import java.io.*;
import java.net.URI;

public class HDFSApp {
    // Connection address of our cloud host (the NameNode)
    public static final String HDFS_PATH = "hdfs://hadoop001:9000/";

    Configuration configuration;
    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception {
        configuration = new Configuration();
        // Set the replication factor on the client side; the default is 3
        configuration.set("dfs.replication", "1");
        // FileSystem is the single entry point of the HDFS API: we must obtain a FileSystem
        // instance before we can do anything with HDFS.
        // Parameters: the URI to connect to, a Configuration, and the user to act as
        // (by default the local machine's user is used)
        fileSystem = FileSystem.get(new URI(HDFS_PATH), configuration, "hadoop");
    }

    // Remember to close the file system once we are done with it
    @After
    public void tearDown() throws IOException {
        fileSystem.close();
    }

    // Create a directory
    @Test
    public void mkdir() throws Exception {
        boolean isSuccess = fileSystem.mkdirs(new Path("/ruozedata/hdfsapi"));
        Assert.assertTrue(isSuccess);
    }

    // Copy a file from the local machine to the cloud host
    @Test
    public void cpFromLocal() throws Exception {
        Path srcPath = new Path("d:/text.log");
        Path destPath = new Path("/ruozedata");
        fileSystem.copyFromLocalFile(srcPath, destPath);
    }

    // The values we put into the Configuration can be read back with get()
    @Test
    public void testReplication() {
        System.out.println(configuration.get("dfs.replication"));
    }

    // Copy a file from the cloud host to the local machine
    @Test
    public void copyToLocal() throws Exception {
        Path srcPath = new Path("1.log");
        Path destPath = new Path("d:/1.log");
        fileSystem.copyToLocalFile(srcPath, destPath);
    }

    // Move all files under a given directory into another directory
    String time = "20191001";
    String ruozedata = "d:/";
    int i = 1;

    @Test
    public void rename() throws IOException {
        // substring(int beginIndex) returns the part of the string starting at beginIndex,
        // so "20191001".substring(2) gives "191001"
        String newDir = time.substring(2);
        fileSystem.mkdirs(new Path(ruozedata + newDir));
        RemoteIterator<LocatedFileStatus> listFiles = fileSystem.listFiles(new Path(ruozedata + time), true);

        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            Path srcPath = fileStatus.getPath();
            String destDir = ruozedata + newDir + "/" + i + "-" + time + ".txt";
            Path destPath = new Path(destDir);
            fileSystem.rename(srcPath, destPath);
            i++;
        }
    }

    // Rename: give an existing file a new name
    @Test
    public void rename02() throws IOException {
        Path srcPath = new Path("/ruozedata/access.log");
        Path destPath = new Path("/ruozedata/access-2.log");
        fileSystem.rename(srcPath, destPath);
    }

    // List the file information under a directory
    @Test
    public void listFile() throws Exception {
        // Directory to list (the second argument makes the listing recursive)
        RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(new Path("D:\\BaiduYunDownload\\高级班入门\\gitlab\\G7\\01-开班"), true);
        // hasNext() returns true while the iterator still has entries
        while (files.hasNext()) {
            // next() returns the next file entry from the iterator
            LocatedFileStatus fileStatus = files.next();
            // Ternary operator: condition ? value-if-true : value-if-false
            String isDir = fileStatus.isDirectory() ? "directory" : "file";
            // Permission information of the file
            String permission = fileStatus.getPermission().toString();
            short replication = fileStatus.getReplication();
            long length = fileStatus.getLen();
            String path = fileStatus.getPath().toString();
            System.out.println(isDir + "\t"
                    + permission + "\t"
                    + replication + "\t"
                    + length + "\t"
                    + path);
            // List which machines store the blocks of this file
            BlockLocation[] blockLocations = fileStatus.getBlockLocations();
            for (BlockLocation location : blockLocations) {
                String[] hosts = location.getHosts();
                for (String host : hosts) {
                    System.out.println(host);
                }
            }
        }
    }

    // TODO... copy files using streams

    @Test
    public void copyFromLocalIO() throws Exception {
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("d:/text.log")));
        FSDataOutputStream out = fileSystem.create(new Path("/ruozedata/access-io.log"));
        IOUtils.copyBytes(in, out, 4096);
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
    }

    @Test
    public void down01() throws Exception {
        FSDataInputStream in = fileSystem.open(new Path("/ruozedata/test.tar.gz"));
        FileOutputStream out = new FileOutputStream(new File("d:/spark.tgz.part0"));

        // Skip the first 128 MB
        in.seek(1024 * 1024 * 128);

        // Read the next 128 MB, 1024 bytes at a time, writing only the bytes actually read
        byte[] buffer = new byte[1024];
        for (int i = 0; i < 1024 * 128; i++) {
            int len = in.read(buffer);
            if (len == -1) {
                break;
            }
            out.write(buffer, 0, len);
        }
        // IOUtils.copyBytes(in, out, configuration); // would copy the remainder of the file
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
    }
}