Handling multi-character delimiters in Hive

This post walks through a custom input format and output format for Hadoop MapReduce jobs, built by extending Hadoop's TextInputFormat/TextOutputFormat and Hive's HiveIgnoreKeyTextOutputFormat. Hive's built-in text SerDe historically accepts only a single-character field delimiter, so to support the two-character delimiter "||" the custom record reader rewrites "||" to Hive's default \001 (Ctrl-A) separator on read, and the custom record writer maps \001 back to "||" on write. Compressed input and output are supported as well, to keep storage efficient.
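
For context, this is the kind of table definition the classes below are meant to back. It is a minimal sketch, not from the original post: the jar path and table schema are placeholders, and the class names are unqualified because the classes are declared without a package.

ADD JAR /path/to/custom-format.jar;

CREATE TABLE customer (
  id   INT,
  name STRING,
  age  INT
)
STORED AS
  INPUTFORMAT  'CustomInputFormat'
  OUTPUTFORMAT 'CustomHiveOutputFormat';

With this in place, data files containing rows such as 1||Alice||30 read back as three columns, because the record reader hands each line to Hive with its default \001 separator in place of "||".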


// CustomerRecordReader.java: a RecordReader that rewrites the "||" delimiter to \001 on read
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;

/**
 * Treats keys as byte offsets in the file and values as lines, with the
 * multi-character delimiter "||" rewritten to \001 as each line is read.
 */
public class CustomerRecordReader implements RecordReader<LongWritable, Text> {
  private static final Log LOG
    = LogFactory.getLog(CustomerRecordReader.class.getName());

  private CompressionCodecFactory compressionCodecs = null;
  private long start;
  private long pos;
  private long end;
  private LineReader in;
  int maxLineLength;

  /**
   * A class that provides a line reader from an input stream.
   * @deprecated Use {@link org.apache.hadoop.util.LineReader} instead.
   */
  @Deprecated
  public static class LineReader extends org.apache.hadoop.util.LineReader {
    LineReader(InputStream in) {
      super(in);
    }
    LineReader(InputStream in, int bufferSize) {
      super(in, bufferSize);
    }
    public LineReader(InputStream in, Configuration conf) throws IOException {
      super(in, conf);
    }
  }

  public CustomerRecordReader(Configuration job,
                          FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                    Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
      in = new LineReader(codec.createInputStream(fileIn), job);
      end = Long.MAX_VALUE;
    } else {
      if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
      }
      in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {  // skip first line and re-establish "start".
      start += in.readLine(new Text(), 0,
                           (int)Math.min((long)Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
  }
 
  public CustomerRecordReader(InputStream in, long offset, long endOffset,
                          int maxLineLength) {
    this.maxLineLength = maxLineLength;
    this.in = new LineReader(in);
    this.start = offset;
    this.pos = offset;
    this.end = endOffset;    
  }

  public CustomerRecordReader(InputStream in, long offset, long endOffset,
                          Configuration job)
    throws IOException{
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                    Integer.MAX_VALUE);
    this.in = new LineReader(in, job);
    this.start = offset;
    this.pos = offset;
    this.end = endOffset;    
  }
 
  public LongWritable createKey() {
    return new LongWritable();
  }
 
  public Text createValue() {
    return new Text();
  }
 
  /** Read a line. */
  public synchronized boolean next(LongWritable key, Text value)
    throws IOException {

    while (pos < end) {
      key.set(pos);

      int newSize = in.readLine(value, maxLineLength,
                                Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
                                         maxLineLength));
      if (newSize == 0) {
        return false;
      }

      // Rewrite the multi-character delimiter "||" to Hive's default
      // field separator \001 (Ctrl-A) so downstream consumers can split
      // the columns on a single character.
      String replaced = value.toString().replaceAll("\\|\\|", "\001");
      value.set(replaced);

      pos += newSize;
      if (newSize < maxLineLength) {
        return true;
      }

      // line too long. try again
      LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    return false;
  }

  /**
   * Get the progress within the split
   */
  public float getProgress() {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (pos - start) / (float)(end - start));
    }
  }
 
  public  synchronized long getPos() throws IOException {
    return pos;
  }

  public synchronized void close() throws IOException {
    if (in != null) {
      in.close();
    }
  }

}
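
A quick way to see the rewrite in isolation is to drive the stream-based constructor directly. This is a minimal sketch, not from the original post; CustomerRecordReaderDemo is a hypothetical harness class:

import java.io.ByteArrayInputStream;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class CustomerRecordReaderDemo {
  public static void main(String[] args) throws Exception {
    byte[] data = "1||Alice||30\n2||Bob||25\n".getBytes("UTF-8");
    CustomerRecordReader reader = new CustomerRecordReader(
        new ByteArrayInputStream(data), 0L, data.length, Integer.MAX_VALUE);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      // every "||" in the line has been rewritten to \001 (Ctrl-A);
      // shown as '^' here so it is visible on a terminal
      System.out.println(key.get() + " -> " + value.toString().replace('\001', '^'));
    }
    reader.close();
  }
}

This prints 0 -> 1^Alice^30 and 13 -> 2^Bob^25: the keys are byte offsets into the stream, and the values carry \001 where "||" used to be.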


// CustomHiveOutputFormat.java: Hive text output format that emits rows via CustomRecordWriter
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;


public class CustomHiveOutputFormat extends
        HiveIgnoreKeyTextOutputFormat<WritableComparable, Writable> {

    @Override
    public org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> getRecordWriter(
            FileSystem ignored, JobConf job, String name, Progressable progress)
            throws IOException {
        return getRecordWriter(ignored, job, name, progress, "");
    }

    public org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> getRecordWriter(
            FileSystem ignored, JobConf job, String name,
            Progressable progress, String s) throws IOException {
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = job.get("mapred.textoutputformat.separator",
                "\t");
        if (!isCompressed) {
            Path file = FileOutputFormat.getTaskOutputPath(job, name);
            FileSystem fs = file.getFileSystem(job);
            FSDataOutputStream fileOut = fs.create(file, progress);
            return new CustomRecordWriter<WritableComparable, Writable>(fileOut, keyValueSeparator);
        } else {
            Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(
                    job, GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass,
                    job);
            // build the filename including the extension
            Path file = FileOutputFormat.getTaskOutputPath(job,
                    name + codec.getDefaultExtension());
            FileSystem fs = file.getFileSystem(job);
            FSDataOutputStream fileOut = fs.create(file, progress);
            return new CustomRecordWriter<WritableComparable, Writable>(new DataOutputStream(
                    codec.createOutputStream(fileOut)), keyValueSeparator);
        }
    }



}
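
In the Hive DDL sketch near the top, this is the class named in the OUTPUTFORMAT clause. Note that the trailing String parameter of the second getRecordWriter overload is never used; it appears only to give the overload a distinct signature.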



// CustomInputFormat.java: wires CustomerRecordReader into a TextInputFormat subclass
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;




public class CustomInputFormat extends TextInputFormat {

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException {
        return new CustomerRecordReader(job, (FileSplit) genericSplit);
    }
}
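
The same pair of formats also works in a plain old-API MapReduce job. The sketch below is not from the original post; CustomFormatJob, the job name, and the paths are placeholders, and reduces are disabled so the job is a straight read-rewrite-write pass:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class CustomFormatJob {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(CustomFormatJob.class);
    job.setJobName("multi-char-delimiter-demo");

    // read "||"-delimited text; the record reader rewrites it to \001
    job.setInputFormat(CustomInputFormat.class);
    // write records back out; the record writer restores "||"
    job.setOutputFormat(CustomOutputFormat.class);

    // map-only identity job: records pass straight through
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);
  }
}

Because the identity map also emits the byte-offset key, each output line comes out as offset<TAB>row; a trivial mapper that drops the key would yield the rows alone.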



// CustomOutputFormat.java: plain MapReduce text output format whose record
// writer (CustomRecordWriter, below) restores the "||" delimiter on write
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;


public class CustomOutputFormat<K, V> extends FileOutputFormat<K, V> {

      public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                                      JobConf job,
                                                      String name,
                                                      Progressable progress)
        throws IOException {
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = job.get("mapred.textoutputformat.separator",
                                           "\t");
        if (!isCompressed) {
          Path file = FileOutputFormat.getTaskOutputPath(job, name);
          FileSystem fs = file.getFileSystem(job);
          FSDataOutputStream fileOut = fs.create(file, progress);
          return new CustomRecordWriter<K, V>(fileOut, keyValueSeparator);
        } else {
          Class<? extends CompressionCodec> codecClass =
            getOutputCompressorClass(job, GzipCodec.class);
          // create the named codec
          CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
          // build the filename including the extension
          Path file =
            FileOutputFormat.getTaskOutputPath(job,
                                               name + codec.getDefaultExtension());
          FileSystem fs = file.getFileSystem(job);
          FSDataOutputStream fileOut = fs.create(file, progress);
          return new CustomRecordWriter<K, V>(new DataOutputStream
                                            (codec.createOutputStream(fileOut)),
                                            keyValueSeparator);
        }
      }
}
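
getRecordWriter chooses between the plain and compressed paths based on the job configuration. Here is a minimal sketch of turning compression on with the old-API helper methods; CompressionConfigDemo is a hypothetical class name:

import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CompressionConfigDemo {
  public static void main(String[] args) {
    JobConf job = new JobConf();
    // makes getCompressOutput(job) above return true
    FileOutputFormat.setCompressOutput(job, true);
    // the codec that getOutputCompressorClass(job, GzipCodec.class) resolves;
    // Gzip is already the default fallback, shown here for explicitness
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }
}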



// CustomRecordWriter.java: the shared record writer; rewrites \001 back to "||"
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;


public class CustomRecordWriter<K, V> implements RecordWriter<K, V> {


    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public CustomRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public CustomRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      if (o instanceof Text) {
        Text to = (Text) o;
        // restore the multi-character delimiter: rewrite Hive's \001
        // field separator back to "||" before the bytes are written
        String strReplace = to.toString().replaceAll("\001", "||");
        to.set(strReplace);
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }

    public synchronized void write(K key, V value)
      throws IOException {

      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      if (!nullKey) {
        writeObject(key);
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);
      }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized void close(Reporter reporter) throws IOException {
      out.close();
    }
 
}
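
And the mirror-image check for the writer, again a sketch that is not part of the original post (CustomRecordWriterDemo is a hypothetical harness class):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class CustomRecordWriterDemo {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    CustomRecordWriter<NullWritable, Text> writer =
        new CustomRecordWriter<NullWritable, Text>(new DataOutputStream(bytes));
    // a row carrying Hive's internal \001 separators...
    writer.write(NullWritable.get(), new Text("1\001Alice\00130"));
    writer.close(null);
    // ...is serialized with the "||" delimiter restored: 1||Alice||30
    System.out.print(bytes.toString("UTF-8"));
  }
}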


