Iceberg series: Hadoop catalog small-file compaction in practice

This post describes how to run Iceberg small-file compaction from a Flink 1.15 project on Hadoop 3.0. It covers the dependency management in the POM file (the Flink core components, the Iceberg libraries, and the Hadoop-related modules), the required resource files, and the Java job itself.


  1. Background
    Flink 1.15, Hadoop 3.0
  2. POM file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.iceberg</groupId>
    <artifactId>flink-iceberg</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <flink.version>1.15.3</flink.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.12</scala.binary.version>
        <slf4j.version>1.7.30</slf4j.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-files</artifactId>
            <version>${flink.version}</version>
        </dependency>


        <!-- exposes the Flink web UI when running from the IDE -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-flink-runtime-1.15</artifactId>
            <version>1.3.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-core</artifactId>
            <version>1.3.0</version>
        </dependency>


    </dependencies>


    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                 <configuration>
                    <archive>
                        <manifest>
                            <!-- specify the main class of the fat jar -->
                            <mainClass>com.iceberg.flink.UnionDelData</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
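
Running mvn clean package builds a jar-with-dependencies in addition to the plain jar; the assembly plugin sets com.iceberg.flink.UnionDelData (the class shown below) as the manifest main class, so the fat jar can be submitted to Flink directly.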
  3. Resource configuration files
    Place the three common Hadoop configuration files (core-site.xml, hdfs-site.xml, yarn-site.xml) in the resources directory; the sketch below shows how they can be picked up from the classpath.
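
A minimal sketch, assuming the three XML files are packaged on the classpath (for example under src/main/resources): Hadoop's Configuration can then load them by resource name rather than by absolute path.

import org.apache.hadoop.conf.Configuration;

// addResource(String) looks the named file up on the classpath
Configuration conf = new Configuration();
conf.addResource("core-site.xml");
conf.addResource("hdfs-site.xml");
conf.addResource("yarn-site.xml");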
  4. Java code
package com.iceberg.flink;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.actions.Actions;
import org.apache.iceberg.hadoop.HadoopCatalog;

import java.io.File;
import java.net.MalformedURLException;

public class UnionDelData {
    public static void main(String[] args) throws MalformedURLException {
        // args: tables (comma separated), target file size, parallelism,
        // snapshot retention time, minimum number of snapshots to retain
        String tableNames = args[0];
        long targetsSize = parseSizeToBytes(args[1]);
        int parallelism = Integer.parseInt(args[2]);
        long retainTime = parseTimeToMillis(args[3]);
        int retainLastNum = Integer.parseInt(args[4]);

        // load the Hadoop configs from an explicit path on the submitting host
        Configuration conf = new Configuration();
        conf.addResource(new File("/home/hadoop/hadoopconf/core-site.xml").toURI().toURL());
        conf.addResource(new File("/home/hadoop/hadoopconf/hdfs-site.xml").toURI().toURL());
        conf.addResource(new File("/home/hadoop/hadoopconf/yarn-site.xml").toURI().toURL());

        // Hadoop catalog rooted at the warehouse path on HDFS
        HadoopCatalog hadoopCatalog = new HadoopCatalog(conf, "/user/hadoop/path/");
        for (String tableName : tableNames.split(",")) {
            Table table = hadoopCatalog.loadTable(TableIdentifier.of("prod", tableName));
            unionDataFile(table, parallelism, targetsSize);
            deleteSnap(table, retainTime, retainLastNum);
        }
    }

    // rewrite (compact) small data files toward the target file size
    public static void unionDataFile(Table table, int parallelism, long targetsSize) {
        Actions.forTable(table)
                .rewriteDataFiles()
                .maxParallelism(parallelism)
                .caseSensitive(false)
                .targetSizeInBytes(targetsSize)
                .execute();
    }

    // expire snapshots older than retainTime, always keeping the last retainLastNum
    public static void deleteSnap(Table table, long retainTime, int retainLastNum) {
        Snapshot snapshot = table.currentSnapshot();
        if (snapshot != null) {
            long expireBefore = snapshot.timestampMillis() - retainTime;
            table.expireSnapshots()
                    .expireOlderThan(expireBefore)
                    .cleanExpiredFiles(true)
                    .retainLast(retainLastNum)
                    .commit();
        }
    }

    // parse a size such as "128M" or "2G" into bytes
    public static long parseSizeToBytes(String sizeWithUnit) {
        long size = Long.parseLong(sizeWithUnit.substring(0, sizeWithUnit.length() - 1));
        char unit = sizeWithUnit.charAt(sizeWithUnit.length() - 1); 
        switch (unit) {
            case 'B':
                return size;
            case 'K':
            case 'k': 
                return size * 1024;
            case 'M':
            case 'm': 
                return size * 1024 * 1024;
            case 'G':
            case 'g': 
                return size * 1024 * 1024 * 1024;
            default:
                throw new IllegalArgumentException("Invalid size unit: " + unit);
        }
    }

    // parse a duration such as "7d" or "12h" into milliseconds
    public static long parseTimeToMillis(String timeWithUnit) {
        long time = Long.parseLong(timeWithUnit.substring(0, timeWithUnit.length() - 1));
        char unit = timeWithUnit.charAt(timeWithUnit.length() - 1);

        switch (unit) {
            case 's':
            case 'S':
                return time * 1000;
            case 'm':
            case 'M':
                return time * 60 * 1000;
            case 'h':
            case 'H':
                return time * 60 * 60 * 1000;
            case 'd':
            case 'D':
                return time * 24 * 60 * 60 * 1000;
            default:
                throw new IllegalArgumentException("Invalid time unit: " + unit);
        }
    }
}
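
As a usage note, the job takes five positional arguments: a comma-separated table list, a target file size, the rewrite parallelism, a snapshot retention time, and the minimum number of snapshots to keep. Below is a small, illustrative check of the two parsers (the ParserCheck class and the sample values are hypothetical, and assume it sits in the same package as UnionDelData):

public class ParserCheck {
    public static void main(String[] args) {
        // "128M" -> 128 * 1024 * 1024 = 134217728 bytes
        System.out.println(UnionDelData.parseSizeToBytes("128M"));
        // "7d" -> 7 * 24 * 60 * 60 * 1000 = 604800000 milliseconds
        System.out.println(UnionDelData.parseTimeToMillis("7d"));
    }
}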
