Implementation of a class that writes byte[] batches to files, rotating to a new file after every write and marking finished files with an ".ok" suffix
package lm;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class FileMgr {
    private String toPath = null;
    private long n = 0; // monotonically increasing file sequence number

    // Holds the currently open output file and its stream.
    private static class Out {
        FileOutputStream writer;
        File file;
        long count = 0; // bytes written to the current file
    }

    private Out out = new Out();

    public FileMgr(String topic, String toPath) {
        this.toPath = toPath + File.separator + topic + "_";
        newFile();
    }

    public void close() throws IOException {
        if (out.writer != null) {
            out.writer.close();
            out.writer = null;
        }
        if (out.file != null) {
            if (out.count == 0) {
                // Every append() rotates, so close() always finds a freshly
                // opened, empty file; delete it instead of leaving it behind.
                out.file.delete();
            } else {
                out.file.renameTo(new File(out.file.getAbsolutePath() + ".ok"));
            }
            out.file = null;
        }
        out.count = 0;
    }

    // Closes the current file (if any), marks it complete by renaming it with
    // an ".ok" suffix, then opens a new file named
    // <toPath>/<topic>_<timestampMillis>_<sequence>.
    public void newFile() {
        try {
            if (out.file != null) {
                out.writer.close();
                out.file.renameTo(new File(out.file.getAbsolutePath() + ".ok"));
            }
            File f = new File(this.toPath + System.currentTimeMillis() + "_" + n++);
            out.file = f;
            out.writer = new FileOutputStream(f);
            out.count = 0;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Writes one batch, flushes it, and rotates, so every batch ends up in
    // its own ".ok" file.
    public void append(byte[] data) throws IOException {
        out.writer.write(data);
        out.writer.flush();
        out.count += data.length;
        newFile();
    }
}
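For illustration, a minimal driver showing the rotation behavior; the class name FileMgrDemo, the topic "demo", and the output directory "/tmp/out" are hypothetical example values, not part of the original code. Each append() lands in its own file, renamed with an ".ok" suffix once the batch is safely on disk, so a downstream loader can pick up only finished files.
package lm;

import java.nio.charset.StandardCharsets;

public class FileMgrDemo {
    public static void main(String[] args) throws Exception {
        // The output directory must already exist.
        FileMgr mgr = new FileMgr("demo", "/tmp/out");
        mgr.append("batch-1".getBytes(StandardCharsets.UTF_8)); // first *.ok file
        mgr.append("batch-2".getBytes(StandardCharsets.UTF_8)); // second *.ok file
        mgr.close(); // removes the empty file left by the final rotation
    }
}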
A utility class that converts CSV lines into Avro GenericRecord data, using the schema fetched from a Confluent Schema Registry
package lm;

import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
public class Csv2AvroUtil {
    private Schema schema = null;
    private String split = ","; // field delimiter, treated as a regex by String.split
    private final int bufferSize = 5 * 1024 * 1024;
    private int batchSize = 10000;
    private String topic;

    public Csv2AvroUtil(String schemaUrl, String topic, String split) {
        try {
            this.topic = topic;
            Schema.Parser parser = new Schema.Parser();
            // Fetch the latest schema registered under the given subject.
            // Note: with the default TopicNameStrategy the subject is usually
            // "<topic>-value"; this code uses the topic name directly.
            CachedSchemaRegistryClient client = new CachedSchemaRegistryClient(schemaUrl, 100);
            schema = parser.parse(client.getLatestSchemaMetadata(topic).getSchema());
            System.out.println("get schema successfully, schema: " + schema.toString());
            this.split = split;
        } catch (IOException | RestClientException e) {
            System.out.println("get schema failed!");
            e.printStackTrace();
        }
    }
    // Serializes one batch of CSV lines into a single chunk of raw
    // binary-encoded Avro records (no container-file header).
    public byte[] convert(List<String> lines) {
        ByteArrayOutputStream out = new ByteArrayOutputStream(bufferSize);
        GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        try {
            for (String line : lines) {
                // Split with limit -1 so trailing empty fields are kept;
                // otherwise rows ending in an empty column would be skipped.
                String[] vals = line.split(this.split, -1);
                if (vals.length != schema.getFields().size()) {
                    continue; // skip rows whose column count does not match the schema
                }
                GenericRecord record = new GenericData.Record(this.schema);
                for (int i = 0; i < schema.getFields().size(); ++i) {
                    record.put(i, convert(schema.getFields().get(i).schema(), vals[i]));
                }
                writer.write(record, encoder);
            }
            encoder.flush();
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return out.toByteArray();
    }
    // Converts a single CSV field into the Java object expected for the
    // given Avro schema type.
    private Object convert(Schema schema, String v) throws Exception {
        Object o = null;
        if (null == v) {
            return null;
        }
        if (v.isEmpty()) {
            switch (schema.getType()) {
                case STRING:
                    return v;
                case BYTES:
                    return ByteBuffer.wrap(v.getBytes("UTF-8"));
                case UNION:
                    break; // let the union branch below pick a member type
                default:
                    return null;
            }
        }
        switch (schema.getType()) {
            case NULL:
                o = null;
                break;
            case BOOLEAN:
                o = Boolean.parseBoolean(v);
                break;
            case INT:
                o = Integer.parseInt(v);
                break;
            case LONG:
                o = Long.parseLong(v);
                break;
            case FLOAT:
                o = Float.parseFloat(v);
                break;
            case DOUBLE:
                o = Double.parseDouble(v);
                break;
            case BYTES:
                try {
                    o = ByteBuffer.wrap(v.getBytes("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
                break;
            case STRING:
                o = v;
                break;
            case UNION:
                // Try each member type in order and keep the first that parses.
                for (Schema mem : schema.getTypes()) {
                    try {
                        o = convert(mem, v);
                    } catch (NumberFormatException e) {
                        continue; // value does not fit this member; try the next one
                    }
                    if (o != null) break;
                }
                break;
            default:
                // RECORD, MAP, ENUM, ARRAY and FIXED are not supported here.
                throw new Exception("Unsupported Avro type: " + schema.getType());
        }
        return o;
    }
    // Reads the CSV file line by line, converts batches of batchSize lines,
    // and hands each serialized batch to FileMgr, which writes one ".ok"
    // file per batch.
    public void readFile(String file, String toPath) {
        try {
            FileMgr mgr = new FileMgr(this.topic, toPath);
            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String line;
            List<String> lines = new ArrayList<String>(this.batchSize);
            while ((line = in.readLine()) != null) {
                lines.add(line);
                if (lines.size() >= batchSize) {
                    mgr.append(this.convert(lines));
                    lines.clear();
                }
            }
            if (!lines.isEmpty()) {
                // flush the final partial batch
                mgr.append(this.convert(lines));
            }
            in.close();
            mgr.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        if (args.length != 5) {
            System.out.println("usage: Csv2AvroUtil <schemaRegistryUrl> <topic> <delimiter> <csvFile> <outputDir>");
            return;
        }
        String schemaUrl = args[0];
        String topic = args[1];
        String split = args[2];
        String file = args[3];
        String toPath = args[4];
        Csv2AvroUtil c2a = new Csv2AvroUtil(schemaUrl, topic, split);
        c2a.readFile(file, toPath);
    }
}
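Since convert() writes raw binary-encoded records back to back, the output files carry no Avro container header and cannot be opened with DataFileReader; a consumer has to decode record after record against the same writer schema. Below is a minimal reader sketch under that assumption; the class name AvroBatchReader and the idea of loading the schema from a local .avsc file are hypothetical, not part of the original code.
package lm;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

public class AvroBatchReader {
    public static void main(String[] args) throws Exception {
        // args[0]: one of the *.ok files produced above
        // args[1]: the writer schema as a local .avsc file
        Schema schema = new Schema.Parser().parse(new File(args[1]));
        try (InputStream in = new FileInputStream(args[0])) {
            GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
            BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(in, null);
            GenericRecord record = null;
            // isEnd() reports end-of-input, so the loop stops after the
            // last record of the batch.
            while (!decoder.isEnd()) {
                record = reader.read(record, decoder);
                System.out.println(record);
            }
        }
    }
}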