Before using the HDFS API, we need to add the Hadoop dependencies to our project's pom.xml file.
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.ruozedata.bigdata</groupId>
  <artifactId>ruoedata-hadoop</artifactId>
  <version>1.0</version>

  <name>ruoedata-hadoop</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <hadoop.version>2.6.0-cdh5.15.1</hadoop.version>
    <zk.version>3.4.5-cdh5.15.1</zk.version>
    <scala.version>2.11.8</scala.version>
    <curator.version>4.0.0</curator.version>
  </properties>

  <!-- Repository for downloading the CDH artifacts -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <!-- Hadoop client dependency; the version comes from the hadoop.version property above -->
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
          <!--<configuration>-->
          <!--<verbose>true</verbose>-->
          <!--<fork>true</fork>-->
          <!--<executable>C:\Program Files\Java\jdk1.8.0_131\bin\javac</executable>-->
          <!--</configuration>-->
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
We can then exercise the HDFS API from a JUnit test class in IDEA:
package com.ruozedata.bigdata.hadoop.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import java.io.*;
import java.net.URI;

public class HDFSApp {
    // Connection address of our cloud host (the NameNode)
    public static final String HDFS_PATH = "hdfs://hadoop001:9000/";

    Configuration configuration;
    FileSystem fileSystem;

    @Before
    public void setUp() throws Exception {
        configuration = new Configuration();
        // Set the replication factor on the client side; the default is 3
        configuration.set("dfs.replication", "1");
        // FileSystem is the single entry point of the HDFS API: we must obtain a FileSystem
        // instance before we can do anything with HDFS.
        // Parameters: the URI to connect to, a Configuration, and the user to act as
        // (by default the local machine's user is used)
        fileSystem = FileSystem.get(new URI(HDFS_PATH), configuration, "hadoop");
    }

    // Remember to close the file system once we are done with it
    @After
    public void tearDown() throws IOException {
        fileSystem.close();
    }

    // Create a directory
    @Test
    public void mkdir() throws Exception {
        boolean isSuccess = fileSystem.mkdirs(new Path("/ruozedata/hdfsapi"));
        Assert.assertTrue(isSuccess);
    }

    // Copy a file from the local machine to the cloud host
    @Test
    public void cpFromLocal() throws Exception {
        Path srcPath = new Path("d:/text.log");
        Path destPath = new Path("/ruozedata");
        fileSystem.copyFromLocalFile(srcPath, destPath);
    }

    // The values we put into the Configuration can be read back with get()
    @Test
    public void testReplication() {
        System.out.println(configuration.get("dfs.replication"));
    }

    // Copy a file from the cloud host to the local machine
    @Test
    public void copyToLocal() throws Exception {
        Path srcPath = new Path("1.log");
        Path destPath = new Path("d:/1.log");
        fileSystem.copyToLocalFile(srcPath, destPath);
    }

    // Move all files under a given directory into another directory
    String time = "20191001";
    String ruozedata = "d:/";
    int i = 1;

    @Test
    public void rename() throws IOException {
        // substring(int beginIndex) returns the part of the string starting at beginIndex,
        // so "20191001".substring(2) gives "191001"
        String newDir = time.substring(2);
        fileSystem.mkdirs(new Path(ruozedata + newDir));
        RemoteIterator<LocatedFileStatus> listFiles = fileSystem.listFiles(new Path(ruozedata + time), true);

        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            Path srcPath = fileStatus.getPath();
            String destDir = ruozedata + newDir + "/" + i + "-" + time + ".txt";
            Path destPath = new Path(destDir);
            fileSystem.rename(srcPath, destPath);
            i++;
        }
    }

    // Rename: give an existing file a new name
    @Test
    public void rename02() throws IOException {
        Path srcPath = new Path("/ruozedata/access.log");
        Path destPath = new Path("/ruozedata/access-2.log");
        fileSystem.rename(srcPath, destPath);
    }

    // List the file information under a directory
    @Test
    public void listFile() throws Exception {
        // Directory to list (the second argument makes the listing recursive)
        RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(new Path("D:\\BaiduYunDownload\\高级班入门\\gitlab\\G7\\01-开班"), true);
        // hasNext() returns true while the iterator still has entries
        while (files.hasNext()) {
            // next() returns the next file entry from the iterator
            LocatedFileStatus fileStatus = files.next();
            // Ternary operator: condition ? value-if-true : value-if-false
            String isDir = fileStatus.isDirectory() ? "directory" : "file";
            // Permission information of the file
            String permission = fileStatus.getPermission().toString();
            short replication = fileStatus.getReplication();
            long length = fileStatus.getLen();
            String path = fileStatus.getPath().toString();
            System.out.println(isDir + "\t"
                    + permission + "\t"
                    + replication + "\t"
                    + length + "\t"
                    + path);
            // List which machines store the blocks of this file
            BlockLocation[] blockLocations = fileStatus.getBlockLocations();
            for (BlockLocation location : blockLocations) {
                String[] hosts = location.getHosts();
                for (String host : hosts) {
                    System.out.println(host);
                }
            }
        }
    }

    // TODO... copy files using streams

    @Test
    public void copyFromLocalIO() throws Exception {
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("d:/text.log")));
        FSDataOutputStream out = fileSystem.create(new Path("/ruozedata/access-io.log"));
        IOUtils.copyBytes(in, out, 4096);
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
    }

    @Test
    public void down01() throws Exception {
        FSDataInputStream in = fileSystem.open(new Path("/ruozedata/test.tar.gz"));
        FileOutputStream out = new FileOutputStream(new File("d:/spark.tgz.part0"));

        // Skip the first 128 MB
        in.seek(1024 * 1024 * 128);

        // Read the next 128 MB, 1024 bytes at a time, writing only the bytes actually read
        byte[] buffer = new byte[1024];
        for (int i = 0; i < 1024 * 128; i++) {
            int len = in.read(buffer);
            if (len == -1) {
                break;
            }
            out.write(buffer, 0, len);
        }
        // IOUtils.copyBytes(in, out, configuration); // would copy the remainder of the file
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
    }
}