package net.thornydev;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Generates Hive schemas for use with the JSON SerDe from
 * org.openx.data.jsonserde.JsonSerDe. GitHub link: https://github.com/rcongiu/Hive-JSON-Serde
 * (Note that the DDL emitted by {@link JsonHiveSchema#createHiveSchema} names
 * org.apache.hive.hcatalog.data.JsonSerDe in its ROW FORMAT clause.)
 *
 * Pass in a valid JSON document string to {@link JsonHiveSchema#createHiveSchema} and it will
 * return a Hive schema for the JSON document.
 *
 * It supports embedded JSON objects, arrays and the standard JSON scalar types: strings,
 * numbers, booleans and null. You probably don't want null in the JSON document you provide
 * as Hive can't use that. For numbers - if the example value has a decimal, it will be
 * typed as "double". If the number has no decimal, it will be typed as "int".
 *
 * This program uses the JSON parsing code from json.org and that code is included in this
 * library, since it has not been packaged and made available for maven/ivy/gradle dependency
 * resolution.
 *
 * Use of main method:
 * JsonHiveSchema has a main method that takes a file path to a JSON doc - this file should have
 * only one JSON document in it. An optional second argument can be provided to name the Hive
 * table that is generated.
 */
public class JsonHiveSchema {

  /** Prints command-line usage to standard output. */
  static void help() {
    System.out.println("Usage: Two arguments possible. First is required. Second is optional");
    System.out.println(" 1st arg: path to JSON file to parse into Hive schema");
    System.out.println(" 2nd arg (optional): tablename. Defaults to 'x'");
  }

  /**
   * Reads the JSON file named by the first argument and prints the generated
   * Hive schema to standard output.
   *
   * @param args {@code args[0]} is the path of the JSON file, or {@code -h} to print
   *             usage and exit; {@code args[1]}, when present, is the table name
   * @throws IllegalArgumentException if no arguments are given
   * @throws Exception if the file cannot be read or the JSON cannot be converted
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      throw new IllegalArgumentException("ERROR: No file specified");
    }
    if (args[0].equals("-h")) {
      help();
      System.exit(0);
    }

    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(new FileReader(args[0]));
    try {
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line).append("\n");
      }
    } finally {
      // Close the reader even when reading fails part-way through.
      br.close();
    }

    String tableName = "x";
    // Accept the table name whenever a second argument is present, even if
    // extra trailing arguments were supplied.
    if (args.length >= 2) {
      tableName = args[1];
    }

    JsonHiveSchema schemaWriter = new JsonHiveSchema(tableName);
    System.out.println(schemaWriter.createHiveSchema(sb.toString()));
  }

  /** Name used in the generated CREATE TABLE statement. */
  private String tableName = "x";

  /** Creates a generator that uses the default table name {@code x}. */
  public JsonHiveSchema() {}

  /**
   * Creates a generator that uses the given table name.
   *
   * @param tableName name to place after CREATE TABLE in the generated schema
   */
  public JsonHiveSchema(String tableName) {
    this.tableName = tableName;
  }

  /**
   * Pass in any valid JSON object and a Hive schema will be returned for it.
   * You should avoid having null values in the JSON document, however.
   *
   * The Hive schema columns will be printed in alphabetical order - overall and
   * within subsections.
   *
   * The generated statement uses the serde class
   * org.apache.hive.hcatalog.data.JsonSerDe (the older serde jar provided
   * org.openx.data.jsonserde.JsonSerDe).
   *
   * @param json text of a JSON object
   * @return string Hive schema
   * @throws JSONException if the JSON does not parse correctly
   * @throws IllegalStateException if the top-level object, a nested object or an
   *         array is empty, since no column or element type can be derived from it
   */
  public String createHiveSchema(String json) throws JSONException {
    JSONObject jo = new JSONObject(json);
    Iterator<String> keys = sortedKeys(jo);
    if (!keys.hasNext()) {
      throw new IllegalStateException("JSON object is empty; no columns to generate");
    }

    StringBuilder sb = new StringBuilder("CREATE TABLE ").append(tableName).append(" (\n");
    boolean first = true;
    while (keys.hasNext()) {
      String k = keys.next();
      // Write the separator before every column except the first, so there is
      // no trailing comma to strip afterwards.
      if (!first) {
        sb.append(',').append("\n");
      }
      first = false;
      sb.append(" ");
      sb.append(k);
      sb.append(' ');
      sb.append(valueToHiveSchema(jo.opt(k)));
    }
    sb.append(")\n");
    return sb.append("ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';").toString();
  }

  /**
   * Returns the keys of the given object in sorted order.
   *
   * @param o object whose keys are wanted
   * @return iterator over a sorted copy of the keys
   */
  private Iterator<String> sortedKeys(JSONObject o) {
    @SuppressWarnings("unchecked")
    Iterator<String> keys = o.keys();
    return new OrderedIterator(keys);
  }

  /**
   * Builds a Hive {@code struct<...>} type for a nested JSON object, with the
   * fields in sorted key order.
   *
   * @param o nested JSON object
   * @return Hive struct type string
   * @throws JSONException if a nested value cannot be read
   * @throws IllegalStateException if the object has no keys
   */
  private String toHiveSchema(JSONObject o) throws JSONException {
    Iterator<String> keys = sortedKeys(o);
    if (!keys.hasNext()) {
      throw new IllegalStateException("Object is empty: " + o.toString());
    }

    StringBuilder sb = new StringBuilder("struct<");
    boolean first = true;
    while (keys.hasNext()) {
      String k = keys.next();
      if (!first) {
        sb.append(", ");
      }
      first = false;
      sb.append(k);
      sb.append(':');
      sb.append(valueToHiveSchema(o.opt(k)));
    }
    sb.append(">");
    return sb.toString();
  }

  /**
   * Builds a Hive {@code array<...>} type for a JSON array.
   *
   * @param a JSON array
   * @return Hive array type string
   * @throws JSONException if the first element cannot be read
   * @throws IllegalStateException if the array is empty
   */
  private String toHiveSchema(JSONArray a) throws JSONException {
    return "array<" + arrayJoin(a, ",") + '>';
  }

  /**
   * Returns the Hive type of the array's elements, derived from the first
   * element only.
   *
   * @param a JSON array to inspect
   * @param separator unused, because only the first element contributes to the result
   * @return Hive type string of the first element
   * @throws JSONException if the first element cannot be read
   * @throws IllegalStateException if the array is empty
   */
  private String arrayJoin(JSONArray a, String separator) throws JSONException {
    if (a.length() == 0) {
      throw new IllegalStateException("Array is empty: " + a.toString());
    }
    return valueToHiveSchema(a.get(0));
  }

  /**
   * Maps a scalar JSON value to its Hive type name.
   *
   * @param o scalar value
   * @return "string", "boolean", a numeric type name, or {@code null} when the
   *         value is none of those (for example JSON null); callers that append
   *         the result to a StringBuilder then emit the text "null"
   */
  private String scalarType(Object o) {
    if (o instanceof String) {
      return "string";
    }
    if (o instanceof Number) {
      return scalarNumericType(o);
    }
    if (o instanceof Boolean) {
      return "boolean";
    }
    return null;
  }

  /**
   * Chooses between "double" and "int" from the number's string form.
   *
   * @param o numeric value
   * @return "double" if the string form has a '.' after its first character,
   *         otherwise "int"
   */
  private String scalarNumericType(Object o) {
    String s = o.toString();
    if (s.indexOf('.') > 0) {
      return "double";
    } else {
      return "int";
    }
  }

  /**
   * Tells whether the value is a JSON scalar.
   *
   * @param o value to test
   * @return true for String, Number, Boolean and {@code JSONObject.NULL}
   */
  private boolean isScalar(Object o) {
    return o instanceof String
        || o instanceof Number
        || o instanceof Boolean
        || o == JSONObject.NULL;
  }

  /**
   * Converts any supported JSON value to its Hive type string.
   *
   * @param o scalar, JSONObject or JSONArray
   * @return Hive type string for the value
   * @throws JSONException if a nested value cannot be read
   * @throws IllegalArgumentException if the value is of an unsupported type or is null
   */
  private String valueToHiveSchema(Object o) throws JSONException {
    if (isScalar(o)) {
      return scalarType(o);
    } else if (o instanceof JSONObject) {
      return toHiveSchema((JSONObject) o);
    } else if (o instanceof JSONArray) {
      return toHiveSchema((JSONArray) o);
    } else {
      // Guard the null case so the caller gets the intended exception instead
      // of a NullPointerException from getClass().
      throw new IllegalArgumentException("unknown type: " + (o == null ? null : o.getClass()));
    }
  }

  /**
   * Iterator that copies all elements of another iterator into a sorted set
   * and then iterates over that set in its natural order.
   */
  static class OrderedIterator implements Iterator<String> {

    /** Iterator over the sorted copy of the elements. */
    Iterator<String> it;

    /**
     * Drains the given iterator into a sorted set.
     *
     * @param iter source of the elements; fully consumed by this constructor
     */
    public OrderedIterator(Iterator<String> iter) {
      SortedSet<String> keys = new TreeSet<String>();
      while (iter.hasNext()) {
        keys.add(iter.next());
      }
      it = keys.iterator();
    }

    @Override
    public boolean hasNext() {
      return it.hasNext();
    }

    @Override
    public String next() {
      return it.next();
    }

    /** Removes the current element from the internal sorted copy. */
    @Override
    public void remove() {
      it.remove();
    }
  }
}