目录
一、get_json_object 使用
二、使用案例
三、源码分析
四、总结
大家好,我是老六。
在数据开发中,我们有大量解析json串的需求,我们选用的UDF函数无非就是:get_json_object和json_tuple。但是在使用get_json_object函数过程中,老六发现get_json_object无法解析key为中文的key:value对。带着这个问题,老六通过源码研究了get_json_object这个函数,探索其中奥秘。
一、get_json_object 使用
- 语法:get_json_object(json_string, '$.key')
- 说明:解析json的字符串json_string,返回path指定的内容。如果输入的json字符串无效,那么返回NULL。这个函数每次只能返回一个数据项。
二、使用案例
本案例是在hive1.2.1版本下实测,可供参考,若有出入,请对照相应版本源码。
构造JSON串:{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}
1、正常案例
SQL:select get_json_object('{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}',"$.test");
结果:
结果很正常,就是解析json,通过key来取value
2、异常案例
1)SQL:select get_json_object('{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}',"$.今日");
结果:
结果开始有些不对了,按照函数使用来说,我理解是可以正常取到 今日:今日键值对,但是结果是null,开始好奇,想搞明白原因是什么。
再次测试
2)SQL:select get_json_object('{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}',"$.gg今日");
结果:
这就很神奇了, 以 'gg今日' 作为键却取出了以'gg'作为键的结果,缘分真是妙不可言呐。
通过测试我们发现两个问题:
1)中文作为key,get_json_object无法取出value值(见 2.1)。
2)以英文开头的混有中文的key,取出了英文部分key对应的value(见 2.2)。
三、源码分析
通过查看源码我们发现get_json_object对应的UDF类的源码如下:
/** * UDFJson. * */ @Description(name = "get_json_object", value = "_FUNC_(json_txt, path) - Extract a json object from path ", extended = "Extract json object from a json string based on json path " + "specified, and return json string of the extracted json object. It " + "will return null if the input json string is invalid.\n" + "A limited version of JSONPath supported:\n" + " $ : Root object\n" + " . : Child operator\n" + " [] : Subscript operator for array\n" + " * : Wildcard for []\n" + "Syntax not supported that's worth noticing:\n" + " '' : Zero length string as key\n" + " .. : Recursive descent\n" + " @ : Current object/element\n" + " () : Script expression\n" + " ?() : Filter (script) expression.\n" + " [,] : Union operator\n" + " [start:end:step] : array slice operator\n") public class UDFJson extends UDF { private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*"); private final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]"); private static final JsonFactory JSON_FACTORY = new JsonFactory(); static { // Allows for unescaped ASCII control characters in JSON values JSON_FACTORY.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS); } private static final ObjectMapper MAPPER = new ObjectMapper(JSON_FACTORY); private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class); private static final JavaType LIST_TYPE = TypeFactory.fromClass(List.class); // An LRU cache using a linked hash map static class HashCache
extends LinkedHashMap { private static final int CACHE_SIZE = 16; private static final int INIT_SIZE = 32; private static final float LOAD_FACTOR = 0.6f; HashCache() { super(INIT_SIZE, LOAD_FACTOR); } private static final long serialVersionUID = 1; @Override protected boolean removeEldestEntry(Map.Entry eldest) { return size() > CACHE_SIZE; } } static Map extractObjectCache = new HashCache (); static Map pathExprCache = new HashCache (); static Map > indexListCache = new HashCache >(); static Map mKeyGroup1Cache = new HashCache (); static Map mKeyMatchesCache = new HashCache (); Text result = new Text(); public UDFJson() { } /** * Extract json object from a json string based on json path specified, and * return json string of the extracted json object. It will return null if the * input json string is invalid. * * A limited version of JSONPath supported: $ : Root object . : Child operator * [] : Subscript operator for array * : Wildcard for [] * * Syntax not supported that's worth noticing: '' : Zero length string as key * .. : Recursive descent @ : Current object/element () : Script * expression ?() : Filter (script) expression. [,] : Union operator * [start:end:step] : array slice operator * * @param jsonString * the json string. * @param pathString * the json path expression. * @return json string or null when an error happens. */ public Text evaluate(String jsonString, String pathString) { if (jsonString == null || jsonString.isEmpty() || pathString == null || pathString.isEmpty() || pathString.charAt(0) != '$') { return null; } int pathExprStart = 1; boolean isRootArray = false; if (pathString.length() > 1) { if (pathString.charAt(1) == '[') { pathExprStart = 0; isRootArray = true; } else if (pathString.charAt(1) == '.') { isRootArray = pathString.length() > 2 && pathString.charAt(2) == '['; } else { return null; } } // Cache pathExpr String[] pathExpr = pathExprCache.get(pathString); if (pathExpr == null) { pathExpr = pathString.split("\\.", -1); pathExprCache.put(pathString, pathExpr); } // Cache extractObject Object extractObject = extractObjectCache.get(jsonString); if (extractObject == null) { JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE; try { extractObject = MAPPER.readValue(jsonString, javaType); } catch (Exception e) { return null; } extractObjectCache.put(jsonString, extractObject); } for (int i = pathExprStart; i < pathExpr.length; i++) { if (extractObject == null) { return null; } extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray); } if (extractObject instanceof Map || extractObject instanceof List) { try { result.set(MAPPER.writeValueAsString(extractObject)); } catch (Exception e) { return null; } } else if (extractObject != null) { result.set(extractObject.toString()); } else { return null; } return result; } private Object extract(Object json, String path, boolean skipMapProc) { // skip MAP processing for the first path element if root is array if (!skipMapProc) { // Cache patternkey.matcher(path).matches() Matcher mKey = null; Boolean mKeyMatches = mKeyMatchesCache.get(path); if (mKeyMatches == null) { mKey = patternKey.matcher(path); mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; mKeyMatchesCache.put(path, mKeyMatches); } if (!mKeyMatches.booleanValue()) { return null; } // Cache mkey.group(1) String mKeyGroup1 = mKeyGroup1Cache.get(path); if (mKeyGroup1 == null) { if (mKey == null) { mKey = patternKey.matcher(path); mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; mKeyMatchesCache.put(path, mKeyMatches); if (!mKeyMatches.booleanValue()) { return null; } } mKeyGroup1 = mKey.group(1); mKeyGroup1Cache.put(path, mKeyGroup1); } json = extract_json_withkey(json, mKeyGroup1); } // Cache indexList ArrayList indexList = indexListCache.get(path); if (indexList == null) { Matcher mIndex = patternIndex.matcher(path); indexList = new ArrayList (); while (mIndex.find()) { indexList.add(mIndex.group(1)); } indexListCache.put(path, indexList); } if (indexList.size() > 0) { json = extract_json_withindex(json, indexList); } return json; } private transient AddingList jsonList = new AddingList(); private static class AddingList extends ArrayList private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\-\:\s]+).*"); 这个就是匹配key的模式串,它的意思是匹配以数字、字母、_、-、:、空格 为开头的字符串。那么这个匹配模式串就决定了,get_json_object函数无法匹配出key是纯中文的键值对,即select get_json_object('{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}',"$.今日") 结果为null; 对于 select get_json_object('{"gg今日":"gg今日","test":"test","gg":"gg","今日":"今日"}',"$.gg今日"),由于匹配key的模式串是匹配以数字、字母、_、-、:、空格 为开头的字符串,所以它将'gg今日'中的'gg'匹配出来,然后作为key去取value,所以就取出了 "gg":"gg" 键值对,即答案为'gg'。
四、总结
实践是检验真理的唯一标准。遇到问题,我们就去它的源头寻找答案。
猜你喜欢
网友评论
- 搜索
- 最新文章
- 热门文章