max_row: Returning the Maximum Row from a Set of Records

This article introduces a custom Hive UDAF (User Defined Aggregation Function) named MaxRow, which returns the maximum value of a specified column, together with the values of its associated columns, as a struct. The implementation is the GenericUDAFMaxRow class below, which builds on Apache Commons Logging and the Hadoop/Hive class libraries; the full source and the mechanics behind it follow.
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryStruct;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

@Description(name = "maxrow", value = "_FUNC_(expr) - Returns the maximum value of expr and values of associated columns as a struct")
public class GenericUDAFMaxRow extends AbstractGenericUDAFResolver {

  static final Log LOG = LogFactory.getLog(GenericUDAFMaxRow.class.getName());

  @Override
  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
    // Verify that the first parameter supports comparisons.
    ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(parameters[0]);
    if (!ObjectInspectorUtils.compareSupported(oi)) {
      throw new UDFArgumentTypeException(0, "Cannot support comparison of map<> type or complex type containing map<>.");
    }
    return new GenericUDAFMaxRowEvaluator();
  }

  // @UDFType(distinctLike=true)
  public static class GenericUDAFMaxRowEvaluator extends GenericUDAFEvaluator {

    ObjectInspector[] inputOIs;   // inspectors for the incoming values (raw columns or partial-struct fields)
    ObjectInspector[] outputOIs;  // standard inspectors for the materialized copies this UDAF stores
    ObjectInspector structOI;     // inspector describing the struct this UDAF returns

    @Override
    public ObjectInspector init(Mode mode, ObjectInspector[] parameters) throws HiveException {
      super.init(mode, parameters);

      int length = parameters.length;
      if (length > 1 || !(parameters[0] instanceof StructObjectInspector)) {
        // Multiple raw columns (or a single non-struct column): original rows,
        // i.e. the map side of the aggregation.
        assert(mode == Mode.PARTIAL1 || mode == Mode.COMPLETE);
        initMapSide(parameters);

      } else {
        // A single struct parameter: a partial aggregate produced by
        // terminatePartial(), i.e. the reduce side (or a combiner).
        assert(mode == Mode.PARTIAL2 || mode == Mode.FINAL);
        initReduceSide((StructObjectInspector) parameters[0]);
      }

      return structOI;
    }

    /* Initialize the UDAF on the map side. */
    private void initMapSide(ObjectInspector[] parameters) throws HiveException {
      int length = parameters.length;
      outputOIs = new ObjectInspector[length];
      List<String> fieldNames = new ArrayList<String>(length);
      List<ObjectInspector> fieldOIs = Arrays.asList(outputOIs);
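      // Arrays.asList returns a fixed-size view backed by outputOIs, so the
      // loop below fills fieldOIs at the same time as it fills the array.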

      for (int i = 0; i < length; i++) {
        fieldNames.add("col" + i); // field names are not made available! :(
        outputOIs[i] = ObjectInspectorUtils.getStandardObjectInspector(parameters[i]);
      }

      inputOIs = parameters;
      structOI = ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /* Initialize the UDAF on the reduce side (or on the map side when a combiner merges partial results). */
    private void initReduceSide(StructObjectInspector inputStructOI) throws HiveException {
      List<? extends StructField> fields = inputStructOI.getAllStructFieldRefs();
      int length = fields.size();
      inputOIs = new ObjectInspector[length];
      outputOIs = new ObjectInspector[length];
      for (int i = 0; i < length; i++) {
        StructField field = fields.get(i);
        inputOIs[i] = field.getFieldObjectInspector();
        outputOIs[i] = ObjectInspectorUtils.getStandardObjectInspector(inputOIs[i]);
      }
      structOI = ObjectInspectorUtils.getStandardObjectInspector(inputStructOI);
    }

    static class MaxAgg implements AggregationBuffer {
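      // Deep copies of the row whose compare column (index 0) is the largest
      // seen so far; null until the first row arrives.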
      Object[] objects;
    }

    @Override
    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
      MaxAgg result = new MaxAgg();
      return result;
    }

    @Override
    public void reset(AggregationBuffer agg) throws HiveException {
      MaxAgg maxagg = (MaxAgg) agg;
      maxagg.objects = null;
    }

    @Override
    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
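      // Raw rows arrive as an Object[] of column values, which merge() already
      // knows how to handle, so iterate() simply delegates to it.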
      merge(agg, parameters);
    }

    @Override
    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
      return terminate(agg);
    }

    @Override
    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
      if (partial != null) {
        MaxAgg maxagg = (MaxAgg) agg;
        List<Object> objects;
        if (partial instanceof Object[]) {
          objects = Arrays.asList((Object[]) partial);
        } else if (partial instanceof LazyBinaryStruct) {
          objects = ((LazyBinaryStruct) partial).getFieldsAsList();
        } else {
          throw new HiveException("Invalid type: " + partial.getClass().getName());
        }

        // Adopt the incoming row if the buffer is empty or its compare column
        // (field 0) is greater than the current maximum's.
        boolean isMax = false;
        if (maxagg.objects == null) {
          isMax = true;
        } else {
          int cmp = ObjectInspectorUtils.compare(maxagg.objects[0], outputOIs[0], objects.get(0), inputOIs[0]);
          if (cmp < 0) {
            isMax = true;
          }
        }

        if (isMax) {
          int length = objects.size();
          maxagg.objects = new Object[length];
          for (int i = 0; i < length; i++) {
            maxagg.objects[i] = ObjectInspectorUtils.copyToStandardObject(objects.get(i), inputOIs[i]);
          }
        }
      }
    }

    @Override
    public Object terminate(AggregationBuffer agg) throws HiveException {
      MaxAgg maxagg = (MaxAgg) agg;
      // Guard against an empty buffer (terminate() can run after reset() with no rows).
      return (maxagg.objects == null) ? null : Arrays.asList(maxagg.objects);
    }
  }
}

maxrow(compare_col, col1, col2, col3, ...) compares rows on compare_col and returns the maximum row, i.e. the values of compare_col, col1, col2, ... from that row. The result is a struct, so individual values must be extracted from its fields; the field names are generated as col0, col1, col2, ... in initMapSide.
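
For completeness, here is a minimal sketch of how the compiled function might be registered and queried from Hive. The jar path, table, and columns (page_views, user_id, ts, page) are hypothetical; the struct fields are addressed as col0, col1, ... because initMapSide generates those names:

ADD JAR /path/to/maxrow.jar;   -- hypothetical jar path
CREATE TEMPORARY FUNCTION maxrow AS 'GenericUDAFMaxRow';

-- For each user, keep the row with the largest ts, then unpack the struct.
SELECT user_id, m.col0 AS ts, m.col1 AS page
FROM (
  SELECT user_id, maxrow(ts, page) AS m
  FROM page_views
  GROUP BY user_id
) t;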
