GenericUDF提供了更好的参数和返回值检查, 效率更高, 适合处理HIVE中的复杂数据类型
下面的例子把字符串按空格切分, 转换成"单词 -> 出现次数"的词频映射(map<string,int>), 例如:
"This is a sentence"->{'This':1, 'is':1, 'a':1, 'sentence':1}
对于外部依赖, 为了让集群的每个节点都能执行jar, 可以用eclipse export Runnable Jar File
package cn.pywei.HiveUDF;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
/**
 * Hive GenericUDF that converts a string into a word-count map.
 *
 * <p>The input is split on single space characters; blank tokens are skipped and
 * every remaining token is mapped to the number of times it occurs in that input,
 * e.g. {@code "a b a"} yields {@code {a=2, b=1}}. The result is exposed to Hive as
 * {@code map<string,int>} (writable {@code Text} keys, {@code IntWritable} values).
 *
 * <p>Accepts exactly one argument of type STRING, CHAR, VARCHAR or VOID (NULL).
 * A NULL input produces an empty map.
 */
@Description(name="WordsArray",value="_FUNC_(string), return the word array by using GenericUDF.")
public class WordArray extends GenericUDF {

    /**
     * Result map, reused across rows to avoid a per-row allocation. It is cleared
     * at the start of every {@link #evaluate} call so counts never leak between rows.
     */
    private final Map<Text, IntWritable> wordCounts = new HashMap<Text, IntWritable>();

    /** Converts the incoming argument to a writable {@code Text}; created in {@link #initialize}. */
    private ObjectInspectorConverters.Converter converter;

    /**
     * Validates the argument list and prepares the input converter.
     *
     * @param arguments object inspectors describing the call's arguments
     * @return the inspector describing the value returned by {@link #evaluate}:
     *         a standard map of writable string to writable int
     * @throws UDFArgumentException if the number of arguments is not exactly one
     * @throws UDFArgumentTypeException if the argument is not a STRING, CHAR,
     *         VARCHAR or VOID primitive
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // check the input argument count
        if (arguments.length != 1) {
            throw new UDFArgumentException("Param must be 1 argu.");
        }

        // check the input argument type; the argument index reported is zero-based
        if (arguments[0].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "A string argument was expected.");
        }
        PrimitiveCategory primitiveCategory =
                ((PrimitiveObjectInspector) arguments[0]).getPrimitiveCategory();
        boolean supported = primitiveCategory == PrimitiveCategory.STRING
                || primitiveCategory == PrimitiveCategory.CHAR
                || primitiveCategory == PrimitiveCategory.VARCHAR
                || primitiveCategory == PrimitiveCategory.VOID;
        if (!supported) {
            throw new UDFArgumentTypeException(0,
                    "A string, char, varchar or null argument was expected");
        }

        // generate a converter for the argument to use in the evaluate function
        converter = ObjectInspectorConverters.getConverter(
                arguments[0], PrimitiveObjectInspectorFactory.writableStringObjectInspector);

        // return the inspector that describes the return value of the evaluate function
        return ObjectInspectorFactory.getStandardMapObjectInspector(
                PrimitiveObjectInspectorFactory.writableStringObjectInspector,
                PrimitiveObjectInspectorFactory.writableIntObjectInspector);
    }

    /**
     * Counts the words of the single string argument.
     *
     * @param arguments the deferred call arguments; only {@code arguments[0]} is read
     * @return the reused word-count map holding the counts for this input only;
     *         empty when the input is NULL
     * @throws HiveException if the deferred argument cannot be evaluated
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Drop the previous row's counts: the map is shared by all evaluate() calls.
        wordCounts.clear();

        // Resolve the deferred argument once.
        Object raw = arguments[0].get();
        if (raw == null) {
            return wordCounts;
        }
        Text text = (Text) converter.convert(raw);
        if (text == null) {
            return wordCounts;
        }

        // populate the word-count map
        for (String word : text.toString().split(" ")) {
            if (StringUtils.isBlank(word)) {
                continue;
            }
            Text key = new Text(word);
            IntWritable count = wordCounts.get(key);
            if (count == null) {
                wordCounts.put(key, new IntWritable(1));
            } else {
                // IntWritable is mutable, so bump the stored counter in place.
                count.set(count.get() + 1);
            }
        }
        return wordCounts;
    }

    /**
     * Builds the text shown for this call in the HQL EXPLAIN output.
     *
     * @param children display strings of the argument expressions
     * @return the function name followed by its arguments in parentheses
     */
    @Override
    public String getDisplayString(String[] children) {
        return "WordsArray(" + String.join(", ", children) + ")";
    }
}
---------------------本文来自 爱知菜 的CSDN 博客 ,全文地址请点击:https://blog.csdn.net/rav009/art ... 965?utm_source=copy
|
|