Hive UDAF Source Code Analysis

Date: 2023-03-08 16:38:58
This post walks through two pieces of Hive source: the built-in sum aggregate, GenericUDAFSum, and the evaluator base class it extends, GenericUDAFEvaluator.

GenericUDAFSum

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.generic;

import java.util.HashSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorObject;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;

/**
* GenericUDAFSum.
*
*/
@Description(name = "sum", value = "_FUNC_(x) - Returns the sum of a set of numbers")
public class GenericUDAFSum extends AbstractGenericUDAFResolver {

static final Logger LOG = LoggerFactory.getLogger(GenericUDAFSum.class.getName());

@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
throws SemanticException {
if (parameters.length != 1) {
throw new UDFArgumentTypeException(parameters.length - 1,
"Exactly one argument is expected.");
}
if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0,
"Only primitive type arguments are accepted but "
+ parameters[0].getTypeName() + " is passed.");
}
switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
case BYTE:
case SHORT:
case INT:
case LONG:
return new GenericUDAFSumLong();
case TIMESTAMP:
case FLOAT:
case DOUBLE:
case STRING:
case VARCHAR:
case CHAR:
return new GenericUDAFSumDouble();
case DECIMAL:
return new GenericUDAFSumHiveDecimal();
case BOOLEAN:
case DATE:
default:
throw new UDFArgumentTypeException(0,
"Only numeric or string type arguments are accepted but "
+ parameters[0].getTypeName() + " is passed.");
}
}

@Override
public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info)
throws SemanticException {
TypeInfo[] parameters = info.getParameters();
GenericUDAFSumEvaluator eval = (GenericUDAFSumEvaluator) getEvaluator(parameters);
eval.setWindowing(info.isWindowing());
eval.setSumDistinct(info.isDistinct());
return eval;
}

public static PrimitiveObjectInspector.PrimitiveCategory getReturnType(TypeInfo type) {
if (type.getCategory() != ObjectInspector.Category.PRIMITIVE) {
return null;
}
switch (((PrimitiveTypeInfo) type).getPrimitiveCategory()) {
case BYTE:
case SHORT:
case INT:
case LONG:
return PrimitiveObjectInspector.PrimitiveCategory.LONG;
case TIMESTAMP:
case FLOAT:
case DOUBLE:
case STRING:
case VARCHAR:
case CHAR:
return PrimitiveObjectInspector.PrimitiveCategory.DOUBLE;
case DECIMAL:
return PrimitiveObjectInspector.PrimitiveCategory.DECIMAL;
}
return null;
}

/**
* The base type for sum operator evaluator
*
*/
public static abstract class GenericUDAFSumEvaluator<ResultType extends Writable> extends GenericUDAFEvaluator {
static abstract class SumAgg<T> extends AbstractAggregationBuffer {
boolean empty;
T sum;
HashSet<ObjectInspectorObject> uniqueObjects; // Unique rows.
}

protected PrimitiveObjectInspector inputOI;
protected PrimitiveObjectInspector outputOI;
protected ResultType result;
protected boolean isWindowing;
protected boolean sumDistinct;

public void setWindowing(boolean isWindowing) {
this.isWindowing = isWindowing;
}

public void setSumDistinct(boolean sumDistinct) {
this.sumDistinct = sumDistinct;
}

protected boolean isWindowingDistinct() {
return isWindowing && sumDistinct;
}

@Override
public Object terminatePartial(AggregationBuffer agg) throws HiveException {
if (isWindowingDistinct()) {
throw new HiveException("Distinct windowing UDAF doesn't support merge and terminatePartial");
} else {
return terminate(agg);
}
}

/**
* Check if the input object is eligible to contribute to the sum. If it is null,
* or (in the SUM(DISTINCT) case) equal to a value already seen, it is skipped.
* @param input the input object
* @return True if sumDistinct is false or the non-null input is different from the previous object
*/
protected boolean isEligibleValue(SumAgg agg, Object input) {
if (input == null) {
return false;
}
if (isWindowingDistinct()) {
HashSet<ObjectInspectorObject> uniqueObjs = agg.uniqueObjects;
ObjectInspectorObject obj = input instanceof ObjectInspectorObject ?
(ObjectInspectorObject)input :
new ObjectInspectorObject(
ObjectInspectorUtils.copyToStandardObject(input, inputOI, ObjectInspectorCopyOption.JAVA),
outputOI);
if (!uniqueObjs.contains(obj)) {
uniqueObjs.add(obj);
return true;
}
return false;
}
return true;
}
}

/**
* GenericUDAFSumHiveDecimal.
*
*/
public static class GenericUDAFSumHiveDecimal extends GenericUDAFSumEvaluator<HiveDecimalWritable> {

@Override
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
assert (parameters.length == 1);
super.init(m, parameters);
result = new HiveDecimalWritable(0);
inputOI = (PrimitiveObjectInspector) parameters[0];
// The output precision is 10 greater than the input which should cover at least
// 10b rows. The scale is the same as the input.
DecimalTypeInfo outputTypeInfo = null;
if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
int precision = Math.min(HiveDecimal.MAX_PRECISION, inputOI.precision() + 10);
outputTypeInfo = TypeInfoFactory.getDecimalTypeInfo(precision, inputOI.scale());
} else {
outputTypeInfo = (DecimalTypeInfo) inputOI.getTypeInfo();
}
ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(outputTypeInfo);
outputOI = (PrimitiveObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(
oi, ObjectInspectorCopyOption.JAVA);
return oi;
}

/** class for storing decimal sum value. */
@AggregationType(estimable = false) // hard to know exactly for decimals
static class SumHiveDecimalWritableAgg extends SumAgg<HiveDecimalWritable> {
}

@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
SumHiveDecimalWritableAgg agg = new SumHiveDecimalWritableAgg();
reset(agg);
return agg;
}

@Override
public void reset(AggregationBuffer agg) throws HiveException {
SumAgg<HiveDecimalWritable> bdAgg = (SumAgg<HiveDecimalWritable>) agg;
bdAgg.empty = true;
bdAgg.sum = new HiveDecimalWritable(0);
bdAgg.uniqueObjects = new HashSet<ObjectInspectorObject>();
}

boolean warned = false;

@Override
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
assert (parameters.length == 1);
try {
if (isEligibleValue((SumHiveDecimalWritableAgg) agg, parameters[0])) {
((SumHiveDecimalWritableAgg)agg).empty = false;
((SumHiveDecimalWritableAgg)agg).sum.mutateAdd(
PrimitiveObjectInspectorUtils.getHiveDecimal(parameters[0], inputOI));
}
} catch (NumberFormatException e) {
if (!warned) {
warned = true;
LOG.warn(getClass().getSimpleName() + " "
+ StringUtils.stringifyException(e));
LOG.warn(getClass().getSimpleName() + " ignoring similar exceptions.");
}
}
}

@Override
public void merge(AggregationBuffer agg, Object partial) throws HiveException {
if (partial != null) {
SumHiveDecimalWritableAgg myagg = (SumHiveDecimalWritableAgg) agg;
if (myagg.sum == null || !myagg.sum.isSet()) {
return;
}
myagg.empty = false;
if (isWindowingDistinct()) {
throw new HiveException("Distinct windowing UDAF doesn't support merge and terminatePartial");
} else {
myagg.sum.mutateAdd(PrimitiveObjectInspectorUtils.getHiveDecimal(partial, inputOI));
}
}
}

@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
SumHiveDecimalWritableAgg myagg = (SumHiveDecimalWritableAgg) agg;
if (myagg.empty || myagg.sum == null || !myagg.sum.isSet()) {
return null;
}
DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo)outputOI.getTypeInfo();
myagg.sum.mutateEnforcePrecisionScale(decimalTypeInfo.getPrecision(), decimalTypeInfo.getScale());
if (!myagg.sum.isSet()) {
LOG.warn("The sum of a column with data type HiveDecimal is out of range");
return null;
}
result.set(myagg.sum);
return result;
}

@Override
public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrameDef) {
// Don't use streaming for distinct cases
if (sumDistinct) {
return null;
}
return new GenericUDAFStreamingEvaluator.SumAvgEnhancer<HiveDecimalWritable, HiveDecimal>(
this, wFrameDef) {

@Override
protected HiveDecimalWritable getNextResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<HiveDecimalWritable, HiveDecimal>.SumAvgStreamingState ss)
throws HiveException {
SumHiveDecimalWritableAgg myagg = (SumHiveDecimalWritableAgg) ss.wrappedBuf;
HiveDecimal r = myagg.empty ? null : myagg.sum.getHiveDecimal();
HiveDecimal d = ss.retrieveNextIntermediateValue();
if (d != null) {
r = r == null ? null : r.subtract(d);
}
return r == null ? null : new HiveDecimalWritable(r);
}

@Override
protected HiveDecimal getCurrentIntermediateResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<HiveDecimalWritable, HiveDecimal>.SumAvgStreamingState ss)
throws HiveException {
SumHiveDecimalWritableAgg myagg = (SumHiveDecimalWritableAgg) ss.wrappedBuf;
return myagg.empty ? null : myagg.sum.getHiveDecimal();
}
};
}
}

/**
* GenericUDAFSumDouble.
*
*/
public static class GenericUDAFSumDouble extends GenericUDAFSumEvaluator<DoubleWritable> {
@Override
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
assert (parameters.length == 1);
super.init(m, parameters);
result = new DoubleWritable(0);
inputOI = (PrimitiveObjectInspector) parameters[0];
outputOI = (PrimitiveObjectInspector)ObjectInspectorUtils.getStandardObjectInspector(inputOI,
ObjectInspectorCopyOption.JAVA);
return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
}

/** class for storing double sum value. */
@AggregationType(estimable = true)
static class SumDoubleAgg extends SumAgg<Double> {
@Override
public int estimate() { return JavaDataModel.PRIMITIVES1 + JavaDataModel.PRIMITIVES2; }
}

@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
SumDoubleAgg result = new SumDoubleAgg();
reset(result);
return result;
}

@Override
public void reset(AggregationBuffer agg) throws HiveException {
SumDoubleAgg myagg = (SumDoubleAgg) agg;
myagg.empty = true;
myagg.sum = 0.0;
myagg.uniqueObjects = new HashSet<ObjectInspectorObject>();
}

boolean warned = false;

@Override
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
assert (parameters.length == 1);
try {
if (isEligibleValue((SumDoubleAgg) agg, parameters[0])) {
((SumDoubleAgg)agg).empty = false;
((SumDoubleAgg)agg).sum += PrimitiveObjectInspectorUtils.getDouble(parameters[0], inputOI);
}
} catch (NumberFormatException e) {
if (!warned) {
warned = true;
LOG.warn(getClass().getSimpleName() + " "
+ StringUtils.stringifyException(e));
LOG.warn(getClass().getSimpleName() + " ignoring similar exceptions.");
}
}
}

@Override
public void merge(AggregationBuffer agg, Object partial) throws HiveException {
if (partial != null) {
SumDoubleAgg myagg = (SumDoubleAgg) agg;
myagg.empty = false;
if (isWindowingDistinct()) {
throw new HiveException("Distinct windowing UDAF doesn't support merge and terminatePartial");
} else {
myagg.sum += PrimitiveObjectInspectorUtils.getDouble(partial, inputOI);
}
}
}

@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
SumDoubleAgg myagg = (SumDoubleAgg) agg;
if (myagg.empty) {
return null;
}
result.set(myagg.sum);
return result;
}

@Override
public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrameDef) {
// Don't use streaming for distinct cases
if (sumDistinct) {
return null;
}
return new GenericUDAFStreamingEvaluator.SumAvgEnhancer<DoubleWritable, Double>(this,
wFrameDef) {

@Override
protected DoubleWritable getNextResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<DoubleWritable, Double>.SumAvgStreamingState ss)
throws HiveException {
SumDoubleAgg myagg = (SumDoubleAgg) ss.wrappedBuf;
Double r = myagg.empty ? null : myagg.sum;
Double d = ss.retrieveNextIntermediateValue();
if (d != null) {
r = r == null ? null : r - d;
}
return r == null ? null : new DoubleWritable(r);
}

@Override
protected Double getCurrentIntermediateResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<DoubleWritable, Double>.SumAvgStreamingState ss)
throws HiveException {
SumDoubleAgg myagg = (SumDoubleAgg) ss.wrappedBuf;
return myagg.empty ? null : new Double(myagg.sum);
}
};
}
}

/**
* GenericUDAFSumLong.
*
*/
public static class GenericUDAFSumLong extends GenericUDAFSumEvaluator<LongWritable> {
@Override
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
assert (parameters.length == 1);
super.init(m, parameters);
result = new LongWritable(0);
inputOI = (PrimitiveObjectInspector) parameters[0];
outputOI = (PrimitiveObjectInspector)ObjectInspectorUtils.getStandardObjectInspector(inputOI,
ObjectInspectorCopyOption.JAVA);
return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
}

/** class for storing long sum value. */
@AggregationType(estimable = true)
static class SumLongAgg extends SumAgg<Long> {
@Override
public int estimate() { return JavaDataModel.PRIMITIVES1 + JavaDataModel.PRIMITIVES2; }
}

@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
SumLongAgg result = new SumLongAgg();
reset(result);
return result;
}

@Override
public void reset(AggregationBuffer agg) throws HiveException {
SumLongAgg myagg = (SumLongAgg) agg;
myagg.empty = true;
myagg.sum = 0L;
myagg.uniqueObjects = new HashSet<ObjectInspectorObject>();
}

private boolean warned = false;

@Override
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
assert (parameters.length == 1);
try {
if (isEligibleValue((SumLongAgg) agg, parameters[0])) {
((SumLongAgg)agg).empty = false;
((SumLongAgg)agg).sum += PrimitiveObjectInspectorUtils.getLong(parameters[0], inputOI);
}
} catch (NumberFormatException e) {
if (!warned) {
warned = true;
LOG.warn(getClass().getSimpleName() + " "
+ StringUtils.stringifyException(e));
}
}
}

@Override
public void merge(AggregationBuffer agg, Object partial) throws HiveException {
if (partial != null) {
SumLongAgg myagg = (SumLongAgg) agg;
myagg.empty = false;
if (isWindowingDistinct()) {
throw new HiveException("Distinct windowing UDAF doesn't support merge and terminatePartial");
} else {
myagg.sum += PrimitiveObjectInspectorUtils.getLong(partial, inputOI);
}
}
}

@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
SumLongAgg myagg = (SumLongAgg) agg;
if (myagg.empty) {
return null;
}
result.set(myagg.sum);
return result;
}

@Override
public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrameDef) {
// Don't use streaming for distinct cases
if (isWindowingDistinct()) {
return null;
}
return new GenericUDAFStreamingEvaluator.SumAvgEnhancer<LongWritable, Long>(this,
wFrameDef) {

@Override
protected LongWritable getNextResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<LongWritable, Long>.SumAvgStreamingState ss)
throws HiveException {
SumLongAgg myagg = (SumLongAgg) ss.wrappedBuf;
Long r = myagg.empty ? null : myagg.sum;
Long d = ss.retrieveNextIntermediateValue();
if (d != null) {
r = r == null ? null : r - d;
}
return r == null ? null : new LongWritable(r);
}

@Override
protected Long getCurrentIntermediateResult(
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.SumAvgEnhancer<LongWritable, Long>.SumAvgStreamingState ss)
throws HiveException {
SumLongAgg myagg = (SumLongAgg) ss.wrappedBuf;
return myagg.empty ? null : new Long(myagg.sum);
}
};
}
}
}
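To make the map-side/reduce-side split concrete, here is a minimal sketch of driving GenericUDAFSumLong through the PARTIAL1 and FINAL modes by hand. This is not code from the original post: the object inspectors chosen and the hand-written driver loop are assumptions standing in for what Hive's GroupByOperator does at runtime.

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SumLongLifecycleSketch {
  public static void main(String[] args) throws HiveException {
    // "Map side" (PARTIAL1): iterate() over original rows, then terminatePartial().
    GenericUDAFEvaluator mapEval = new GenericUDAFSum.GenericUDAFSumLong();
    ObjectInspector longOI = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
    ObjectInspector partialOI = mapEval.init(Mode.PARTIAL1, new ObjectInspector[] { longOI });
    AggregationBuffer mapBuf = mapEval.getNewAggregationBuffer();
    for (long v : new long[] { 1L, 2L, 3L }) {
      mapEval.iterate(mapBuf, new Object[] { v }); // each original row
    }
    Object partial = mapEval.terminatePartial(mapBuf); // a LongWritable holding 6

    // "Reduce side" (FINAL): merge() the partial results, then terminate().
    GenericUDAFEvaluator reduceEval = new GenericUDAFSum.GenericUDAFSumLong();
    reduceEval.init(Mode.FINAL, new ObjectInspector[] { partialOI });
    AggregationBuffer reduceBuf = reduceEval.getNewAggregationBuffer();
    reduceEval.merge(reduceBuf, partial);
    System.out.println(reduceEval.terminate(reduceBuf)); // prints 6
  }
}

In a real query Hive picks the modes itself; a combiner stage (PARTIAL2) would look like the FINAL half above but end with terminatePartial() instead of terminate().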


GenericUDAFEvaluator
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.udf.generic;

import java.io.Closeable;
import java.io.IOException;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

import org.apache.hadoop.hive.ql.exec.MapredContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hive.common.util.AnnotationUtils;

/**
* A Generic User-defined aggregation function (GenericUDAF) for use with Hive.
*
* New GenericUDAF classes need to inherit from this GenericUDAF class.
*
* GenericUDAFs are superior to normal UDAFs in the following ways:
* 1. They can accept arguments of complex types, and return complex types.
* 2. They can accept a variable number of arguments.
* 3. They can accept an infinite number of function signatures - for example,
* it's easy to write a GenericUDAF that accepts array<int>, array<array<int>>
* and so on (arbitrary levels of nesting).
*/
@UDFType(deterministic = true)
public abstract class GenericUDAFEvaluator implements Closeable {

@Retention(RetentionPolicy.RUNTIME)
public static @interface AggregationType {
boolean estimable() default false;
}

public static boolean isEstimable(AggregationBuffer buffer) {
if (buffer instanceof AbstractAggregationBuffer) {
Class<? extends AggregationBuffer> clazz = buffer.getClass();
AggregationType annotation = AnnotationUtils.getAnnotation(clazz, AggregationType.class);
return annotation != null && annotation.estimable();
}
return false;
}

/**
* Mode.
*
*/
public static enum Mode {
/**
* PARTIAL1: from original data to partial aggregation data: iterate() and
* terminatePartial() will be called.
*/
PARTIAL1, // equivalent to the map phase: iterate() and terminatePartial() are called
/**
* PARTIAL2: from partial aggregation data to partial aggregation data:
* merge() and terminatePartial() will be called.
*/
PARTIAL2, // equivalent to the combiner phase: merge() and terminatePartial() are called
/**
* FINAL: from partial aggregation to full aggregation: merge() and
* terminate() will be called.
*/
FINAL, // equivalent to the reduce phase: merge() and terminate() are called
/**
* COMPLETE: from original data directly to full aggregation: iterate() and
* terminate() will be called.
*/
COMPLETE // equivalent to a map-only job (no reduce phase): iterate() and terminate() are called
};

Mode mode;

/**
* The constructor.
*/
public GenericUDAFEvaluator() {
}

/**
* Additionally setup GenericUDAFEvaluator with MapredContext before initializing.
* This is only called in runtime of MapRedTask.
*
* @param mapredContext context
*/
public void configure(MapredContext mapredContext) {
}

/**
* Initialize the evaluator.
*
* @param m
* The mode of aggregation. (Blogger's note: the mode determines which of the
* four methods below get called and how the evaluator is initialized.)
* @param parameters
* The ObjectInspector for the parameters: In PARTIAL1 and COMPLETE
* mode, the parameters are original data (blogger's note: PARTIAL1 is the map
* side, COMPLETE is a reduce with no map); In PARTIAL2 and FINAL
* mode, the parameters are just partial aggregations (in that case,
* the array will always have a single element; blogger's note: these two
* modes receive already-aggregated data).
* @return The ObjectInspector for the return value. In PARTIAL1 and PARTIAL2
* mode, the ObjectInspector for the return value of
* terminatePartial() call; In FINAL and COMPLETE mode, the
* ObjectInspector for the return value of terminate() call.
*
* NOTE: We need ObjectInspector[] (in addition to the TypeInfo[] in
* GenericUDAFResolver) for 2 reasons: 1. ObjectInspector contains
* more information than TypeInfo. 2. We call GenericUDAFResolver.getEvaluator
* at compilation time, but GenericUDAFEvaluator.init at execution time.
*/
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
// This function should be overridden in every subclass,
// and the subclass should call super.init(m, parameters) to get the mode set.
mode = m;
return null;
}

/**
* The interface for a class that is used to store the aggregation result
* during the process of aggregation.
*
* We split this piece of data out because there can be millions of instances
* of this Aggregation in hash-based aggregation process, and it's very
* important to conserve memory.
*
* In the future, we may completely hide this class inside the Evaluator and
* use integer numbers to identify which aggregation we are looking at.
*
* @deprecated use {@link AbstractAggregationBuffer} instead
*/
public static interface AggregationBuffer {
};

public static abstract class AbstractAggregationBuffer implements AggregationBuffer {
/**
* Estimate the size of memory which is occupied by aggregation buffer.
* Currently, hive assumes that primitive types occupy 16 bytes and each java object
* has 64 bytes of overhead. For maps, each entry also has 64 bytes of overhead.
*/
public int estimate() { return -1; }
}

/**
* Get a new aggregation object.
*/
public abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

/**
* Reset the aggregation. This is useful if we want to reuse the same
* aggregation.
*/
public abstract void reset(AggregationBuffer agg) throws HiveException;

/**
* Close GenericUDAFEvaluator.
* This is only called in runtime of MapRedTask.
*/
public void close() throws IOException {
}

/**
* This function will be called by GroupByOperator when it sees a new input
* row.
*
* @param agg
* The object to store the aggregation result.
* @param parameters
* The row, can be inspected by the OIs passed in init().
*/
public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
iterate(agg, parameters);
} else {
assert (parameters.length == 1);
merge(agg, parameters[0]);
}
}

/**
* This function will be called by GroupByOperator to obtain the aggregation
* output for a group (partial or final, depending on the mode).
*
* @param agg
* The object to store the aggregation result.
*/
public Object evaluate(AggregationBuffer agg) throws HiveException {
if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
return terminatePartial(agg);
} else {
return terminate(agg);
}
}

/**
* Iterate through original data.
*
* @param parameters
* The objects of parameters.
*/
public abstract void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;

/**
* Get partial aggregation result.
*
* @return partial aggregation result.
*/
public abstract Object terminatePartial(AggregationBuffer agg) throws HiveException;

/**
* Merge with partial aggregation result. NOTE: null might be passed in case
* there is no input data.
*
* @param partial
* The partial aggregation result.
*/
public abstract void merge(AggregationBuffer agg, Object partial) throws HiveException;

/**
* Get final aggregation result.
*
* @return final aggregation result.
*/
public abstract Object terminate(AggregationBuffer agg) throws HiveException;

/**
* When evaluating an aggregate over a fixed Window, the naive way to compute
* results is to compute the aggregate for each row. But often there is a way
* to compute results in a more efficient manner. This method enables the
* basic evaluator to provide a function object that does the job in a more
* efficient manner.
* <p>
* This method is called after this Evaluator is initialized. The returned
* Function must be initialized. It is passed the 'window' of aggregation for
* each row.
*
* @param wFrmDef
* the Window definition in play for this evaluation.
* @return null implies that this fn cannot be processed in Streaming mode. So
* each row is evaluated independently.
*/
public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrmDef) {
return null;
}
}
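Putting the contract above together, here is a minimal sketch of a custom evaluator. It is an illustration written for this post, not Hive source: the class name GenericUDAFTotalLength and its behavior (summing string lengths) are assumptions, and a complete UDAF would also pair it with a resolver (AbstractGenericUDAFResolver) that validates argument types, as GenericUDAFSum does above.

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.io.LongWritable;

public class GenericUDAFTotalLength extends GenericUDAFEvaluator {

  /** Aggregation state: a single running total. */
  static class LenAgg extends AbstractAggregationBuffer {
    long total;
  }

  private PrimitiveObjectInspector inputOI;
  private final LongWritable result = new LongWritable(0);

  @Override
  public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
    super.init(m, parameters); // lets the base class remember the mode
    inputOI = (PrimitiveObjectInspector) parameters[0];
    // The partial result and the final result have the same shape here: one long.
    return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
  }

  @Override
  public AggregationBuffer getNewAggregationBuffer() throws HiveException {
    LenAgg agg = new LenAgg();
    reset(agg);
    return agg;
  }

  @Override
  public void reset(AggregationBuffer agg) throws HiveException {
    ((LenAgg) agg).total = 0;
  }

  @Override
  public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
    // PARTIAL1 / COMPLETE: called once per original row.
    if (parameters[0] != null) {
      String s = PrimitiveObjectInspectorUtils.getString(parameters[0], inputOI);
      ((LenAgg) agg).total += s.length();
    }
  }

  @Override
  public Object terminatePartial(AggregationBuffer agg) throws HiveException {
    return terminate(agg); // partial result has the same shape as the final one
  }

  @Override
  public void merge(AggregationBuffer agg, Object partial) throws HiveException {
    // PARTIAL2 / FINAL: combine a value produced by terminatePartial() on another task.
    if (partial != null) {
      ((LenAgg) agg).total += PrimitiveObjectInspectorUtils.getLong(partial, inputOI);
    }
  }

  @Override
  public Object terminate(AggregationBuffer agg) throws HiveException {
    result.set(((LenAgg) agg).total);
    return result;
  }
}

Paired with a resolver and registered in Hive, such an evaluator would be driven through the same PARTIAL1 / PARTIAL2 / FINAL / COMPLETE modes documented above.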

Reference: http://paddy-w.iteye.com/blog/2081409