aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/jsonb_gin.c
diff options
context:
space:
mode:
authorAndrew Dunstan <andrew@dunslane.net>2014-03-23 16:40:19 -0400
committerAndrew Dunstan <andrew@dunslane.net>2014-03-23 16:40:19 -0400
commitd9134d0a355cfa447adc80db4505d5931084278a (patch)
treecefe155d0f0f71b9279444a86eab4b1b1facafdb /src/backend/utils/adt/jsonb_gin.c
parentb2b2491b06074e68fc7c96148cb0fdf0c8eb0469 (diff)
downloadpostgresql-d9134d0a355cfa447adc80db4505d5931084278a.tar.gz
postgresql-d9134d0a355cfa447adc80db4505d5931084278a.zip
Introduce jsonb, a structured format for storing json.
The new format accepts exactly the same data as the json type. However, it is stored in a format that does not require reparsing the orgiginal text in order to process it, making it much more suitable for indexing and other operations. Insignificant whitespace is discarded, and the order of object keys is not preserved. Neither are duplicate object keys kept - the later value for a given key is the only one stored. The new type has all the functions and operators that the json type has, with the exception of the json generation functions (to_json, json_agg etc.) and with identical semantics. In addition, there are operator classes for hash and btree indexing, and two classes for GIN indexing, that have no equivalent in the json type. This feature grew out of previous work by Oleg Bartunov and Teodor Sigaev, which was intended to provide similar facilities to a nested hstore type, but which in the end proved to have some significant compatibility issues. Authors: Oleg Bartunov, Teodor Sigaev, Peter Geoghegan and Andrew Dunstan. Review: Andres Freund
Diffstat (limited to 'src/backend/utils/adt/jsonb_gin.c')
-rw-r--r--src/backend/utils/adt/jsonb_gin.c646
1 files changed, 646 insertions, 0 deletions
diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c
new file mode 100644
index 00000000000..4a6b8fd6888
--- /dev/null
+++ b/src/backend/utils/adt/jsonb_gin.c
@@ -0,0 +1,646 @@
+/*-------------------------------------------------------------------------
+ *
+ * jsonb_gin.c
+ * GIN support functions for jsonb
+ *
+ * Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/jsonb_gin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/gin.h"
+#include "access/skey.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/jsonb.h"
+
+typedef struct PathHashStack
+{
+ uint32 hash;
+ struct PathHashStack *parent;
+} PathHashStack;
+
+static text *make_text_key(const char *str, int len, char flag);
+static text *make_scalar_key(const JsonbValue * scalarVal, char flag);
+
+/*
+ *
+ * jsonb_ops GIN opclass support functions
+ *
+ */
+Datum
+gin_compare_jsonb(PG_FUNCTION_ARGS)
+{
+ text *arg1 = PG_GETARG_TEXT_PP(0);
+ text *arg2 = PG_GETARG_TEXT_PP(1);
+ int32 result;
+ char *a1p,
+ *a2p;
+ int len1,
+ len2;
+
+ a1p = VARDATA_ANY(arg1);
+ a2p = VARDATA_ANY(arg2);
+
+ len1 = VARSIZE_ANY_EXHDR(arg1);
+ len2 = VARSIZE_ANY_EXHDR(arg2);
+
+ /* Compare text as bttextcmp does, but always using C collation */
+ result = varstr_cmp(a1p, len1, a2p, len2, C_COLLATION_OID);
+
+ PG_FREE_IF_COPY(arg1, 0);
+ PG_FREE_IF_COPY(arg2, 1);
+
+ PG_RETURN_INT32(result);
+}
+
+Datum
+gin_extract_jsonb(PG_FUNCTION_ARGS)
+{
+ Jsonb *jb = (Jsonb *) PG_GETARG_JSONB(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ Datum *entries = NULL;
+ int total = 2 * JB_ROOT_COUNT(jb);
+ int i = 0,
+ r;
+ JsonbIterator *it;
+ JsonbValue v;
+
+ if (total == 0)
+ {
+ *nentries = 0;
+ PG_RETURN_POINTER(NULL);
+ }
+
+ entries = (Datum *) palloc(sizeof(Datum) * total);
+
+ it = JsonbIteratorInit(VARDATA(jb));
+
+ while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+ {
+ if (i >= total)
+ {
+ total *= 2;
+ entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
+ }
+
+ /*
+ * Serialize keys and elements equivalently, but only when elements
+ * are Jsonb strings. Otherwise, serialize elements as values. Array
+ * elements are indexed as keys, for the benefit of
+ * JsonbExistsStrategyNumber. Our definition of existence does not
+ * allow for checking the existence of a non-jbvString element (just
+ * like the definition of the underlying operator), because the
+ * operator takes a text rhs argument (which is taken as a proxy for an
+ * equivalent Jsonb string).
+ *
+ * The way existence is represented does not preclude an alternative
+ * existence operator, that takes as its rhs value an arbitrarily
+ * internally-typed Jsonb. The only reason that isn't the case here is
+ * that the existence operator is only really intended to determine if
+ * an object has a certain key (object pair keys are of course
+ * invariably strings), which is extended to jsonb arrays. You could
+ * think of the default Jsonb definition of existence as being
+ * equivalent to a definition where all types of scalar array elements
+ * are keys that we can check the existence of, while just forbidding
+ * non-string notation. This inflexibility prevents the user from
+ * having to qualify that the rhs string is a raw scalar string (that
+ * is, naturally no internal string quoting in required for the text
+ * argument), and allows us to not set the reset flag for
+ * JsonbExistsStrategyNumber, since we know that keys are strings for
+ * both objects and arrays, and don't have to further account for type
+ * mismatch. Not having to set the reset flag makes it less than
+ * tempting to tighten up the definition of existence to preclude array
+ * elements entirely, which would arguably be a simpler alternative.
+ * In any case the infrastructure used to implement the existence
+ * operator could trivially support this hypothetical, slightly
+ * distinct definition of existence.
+ */
+ switch (r)
+ {
+ case WJB_KEY:
+ /* Serialize key separately, for existence strategies */
+ entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
+ break;
+ case WJB_ELEM:
+ if (v.type == jbvString)
+ entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
+ else
+ entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
+ break;
+ case WJB_VALUE:
+ entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
+ break;
+ default:
+ continue;
+ }
+ }
+
+ *nentries = i;
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
+gin_extract_jsonb_query(PG_FUNCTION_ARGS)
+{
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ Datum *entries;
+
+ if (strategy == JsonbContainsStrategyNumber)
+ {
+ /* Query is a jsonb, so just apply gin_extract_jsonb... */
+ entries = (Datum *)
+ DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb,
+ PG_GETARG_DATUM(0),
+ PointerGetDatum(nentries)));
+ /* ...although "contains {}" requires a full index scan */
+ if (entries == NULL)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+ }
+ else if (strategy == JsonbExistsStrategyNumber)
+ {
+ text *query = PG_GETARG_TEXT_PP(0);
+ text *item;
+
+ *nentries = 1;
+ entries = (Datum *) palloc(sizeof(Datum));
+ item = make_text_key(VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query),
+ JKEYELEM);
+ entries[0] = PointerGetDatum(item);
+ }
+ else if (strategy == JsonbExistsAnyStrategyNumber ||
+ strategy == JsonbExistsAllStrategyNumber)
+ {
+ ArrayType *query = PG_GETARG_ARRAYTYPE_P(0);
+ Datum *key_datums;
+ bool *key_nulls;
+ int key_count;
+ int i,
+ j;
+ text *item;
+
+ deconstruct_array(query,
+ TEXTOID, -1, false, 'i',
+ &key_datums, &key_nulls, &key_count);
+
+ entries = (Datum *) palloc(sizeof(Datum) * key_count);
+
+ for (i = 0, j = 0; i < key_count; ++i)
+ {
+ /* Nulls in the array are ignored */
+ if (key_nulls[i])
+ continue;
+ item = make_text_key(VARDATA(key_datums[i]),
+ VARSIZE(key_datums[i]) - VARHDRSZ,
+ JKEYELEM);
+ entries[j++] = PointerGetDatum(item);
+ }
+
+ *nentries = j;
+ /* ExistsAll with no keys should match everything */
+ if (j == 0 && strategy == JsonbExistsAllStrategyNumber)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+ }
+ else
+ {
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ entries = NULL; /* keep compiler quiet */
+ }
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
+gin_consistent_jsonb(PG_FUNCTION_ARGS)
+{
+ bool *check = (bool *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+
+ /* Jsonb *query = PG_GETARG_JSONB(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(5);
+ bool res = true;
+ int32 i;
+
+ if (strategy == JsonbContainsStrategyNumber)
+ {
+ /*
+ * Index doesn't have information about correspondence of Jsonb keys
+ * and values (as distinct from GIN keys, which a key/value pair is
+ * stored as), so invariably we recheck. Besides, there are some
+ * special rules around the containment of raw scalar arrays and
+ * regular arrays that are not represented here. However, if all of
+ * the keys are not present, that's sufficient reason to return false
+ * and finish immediately.
+ */
+ *recheck = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ else if (strategy == JsonbExistsStrategyNumber)
+ {
+ /* Existence of key guaranteed in default search mode */
+ *recheck = false;
+ res = true;
+ }
+ else if (strategy == JsonbExistsAnyStrategyNumber)
+ {
+ /* Existence of key guaranteed in default search mode */
+ *recheck = false;
+ res = true;
+ }
+ else if (strategy == JsonbExistsAllStrategyNumber)
+ {
+ /* Testing for the presence of all keys gives an exact result */
+ *recheck = false;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ else
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+
+ PG_RETURN_BOOL(res);
+}
+
+Datum
+gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
+{
+ GinLogicValue *check = (GinLogicValue *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+ /* Jsonb *query = PG_GETARG_JSONB(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ GinLogicValue res = GIN_TRUE;
+
+ int32 i;
+
+ if (strategy == JsonbContainsStrategyNumber)
+ {
+ bool has_maybe = false;
+
+ /*
+ * All extracted keys must be present. Combination of GIN_MAYBE and
+ * GIN_TRUE gives GIN_MAYBE result because then all keys may be
+ * present.
+ */
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ has_maybe = true;
+ }
+ }
+
+ /*
+ * Index doesn't have information about correspondence of Jsonb keys
+ * and values (as distinct from GIN keys, which a key/value pair is
+ * stored as), so invariably we recheck. This is also reflected in how
+ * GIN_MAYBE is given in response to there being no GIN_MAYBE input.
+ */
+ if (!has_maybe && res == GIN_TRUE)
+ res = GIN_MAYBE;
+ }
+ else if (strategy == JsonbExistsStrategyNumber ||
+ strategy == JsonbExistsAnyStrategyNumber)
+ {
+ /* Existence of key guaranteed in default search mode */
+ res = GIN_FALSE;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_TRUE)
+ {
+ res = GIN_TRUE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ }
+ else if (strategy == JsonbExistsAllStrategyNumber)
+ {
+ /* Testing for the presence of all keys gives an exact result */
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ }
+ }
+ }
+ else
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+
+ PG_RETURN_GIN_LOGIC_VALUE(res);
+}
+
+/*
+ *
+ * jsonb_hash_ops GIN opclass support functions
+ *
+ */
+Datum
+gin_consistent_jsonb_hash(PG_FUNCTION_ARGS)
+{
+ bool *check = (bool *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+ /* Jsonb *query = PG_GETARG_JSONB(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ bool *recheck = (bool *) PG_GETARG_POINTER(5);
+ bool res = true;
+ int32 i;
+
+ if (strategy != JsonbContainsStrategyNumber)
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+
+ /*
+ * jsonb_hash_ops index doesn't have information about correspondence
+ * of Jsonb keys and values (as distinct from GIN keys, which a
+ * key/value pair is stored as), so invariably we recheck. Besides,
+ * there are some special rules around the containment of raw scalar
+ * arrays and regular arrays that are not represented here. However,
+ * if all of the keys are not present, that's sufficient reason to
+ * return false and finish immediately.
+ */
+ *recheck = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+
+ PG_RETURN_BOOL(res);
+}
+
+Datum
+gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS)
+{
+ GinLogicValue *check = (GinLogicValue *) PG_GETARG_POINTER(0);
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
+ /* Jsonb *query = PG_GETARG_JSONB(2); */
+ int32 nkeys = PG_GETARG_INT32(3);
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ GinLogicValue res = GIN_TRUE;
+ int32 i;
+ bool has_maybe = false;
+
+ if (strategy != JsonbContainsStrategyNumber)
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+
+ /*
+ * All extracted keys must be present. A combination of GIN_MAYBE and
+ * GIN_TRUE induces a GIN_MAYBE result, because then all keys may be
+ * present.
+ */
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i] == GIN_FALSE)
+ {
+ res = GIN_FALSE;
+ break;
+ }
+ if (check[i] == GIN_MAYBE)
+ {
+ res = GIN_MAYBE;
+ has_maybe = true;
+ }
+ }
+
+ /*
+ * jsonb_hash_ops index doesn't have information about correspondence of
+ * Jsonb keys and values (as distinct from GIN keys, which for this opclass
+ * are a hash of a pair, or a hash of just an element), so invariably we
+ * recheck. This is also reflected in how GIN_MAYBE is given in response
+ * to there being no GIN_MAYBE input.
+ */
+ if (!has_maybe && res == GIN_TRUE)
+ res = GIN_MAYBE;
+
+ PG_RETURN_GIN_LOGIC_VALUE(res);
+}
+
+Datum
+gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
+{
+ Jsonb *jb = PG_GETARG_JSONB(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ int total = 2 * JB_ROOT_COUNT(jb);
+ JsonbIterator *it;
+ JsonbValue v;
+ PathHashStack tail;
+ PathHashStack *stack;
+ int i = 0,
+ r;
+ Datum *entries = NULL;
+
+ if (total == 0)
+ {
+ *nentries = 0;
+ PG_RETURN_POINTER(NULL);
+ }
+
+ entries = (Datum *) palloc(sizeof(Datum) * total);
+
+ it = JsonbIteratorInit(VARDATA(jb));
+
+ tail.parent = NULL;
+ tail.hash = 0;
+ stack = &tail;
+
+ while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+ {
+ PathHashStack *tmp;
+
+ if (i >= total)
+ {
+ total *= 2;
+ entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
+ }
+
+ switch (r)
+ {
+ case WJB_BEGIN_ARRAY:
+ case WJB_BEGIN_OBJECT:
+ tmp = stack;
+ stack = (PathHashStack *) palloc(sizeof(PathHashStack));
+
+ /*
+ * Nesting an array within another array will not alter
+ * innermost scalar element hash values, but that seems
+ * inconsequential
+ */
+ if (tmp->parent)
+ {
+ /*
+ * We pass forward hashes from previous container nesting
+ * levels so that nested arrays with an outermost nested
+ * object will have element hashes mixed with the outermost
+ * key. It's also somewhat useful to have nested objects
+ * innermost values have hashes that are a function of not
+ * just their own key, but outer keys too.
+ */
+ stack->hash = tmp->hash;
+ }
+ else
+ {
+ /*
+ * At least nested level, initialize with stable container
+ * type proxy value
+ */
+ stack->hash = (r == WJB_BEGIN_ARRAY)? JB_FARRAY:JB_FOBJECT;
+ }
+ stack->parent = tmp;
+ break;
+ case WJB_KEY:
+ /* Initialize hash from parent */
+ stack->hash = stack->parent->hash;
+ JsonbHashScalarValue(&v, &stack->hash);
+ break;
+ case WJB_ELEM:
+ /* Elements have parent hash mixed in separately */
+ stack->hash = stack->parent->hash;
+ case WJB_VALUE:
+ /* Element/value case */
+ JsonbHashScalarValue(&v, &stack->hash);
+ entries[i++] = stack->hash;
+ break;
+ case WJB_END_ARRAY:
+ case WJB_END_OBJECT:
+ /* Pop the stack */
+ tmp = stack->parent;
+ pfree(stack);
+ stack = tmp;
+ break;
+ default:
+ elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
+ }
+ }
+
+ *nentries = i;
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
+gin_extract_jsonb_query_hash(PG_FUNCTION_ARGS)
+{
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ Datum *entries;
+
+ if (strategy != JsonbContainsStrategyNumber)
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+
+ /* Query is a jsonb, so just apply gin_extract_jsonb... */
+ entries = (Datum *)
+ DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_hash,
+ PG_GETARG_DATUM(0),
+ PointerGetDatum(nentries)));
+
+ /* ...although "contains {}" requires a full index scan */
+ if (entries == NULL)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+
+ PG_RETURN_POINTER(entries);
+}
+
+/*
+ * Build a text value from a cstring and flag suitable for storage as a key
+ * value
+ */
+static text *
+make_text_key(const char *str, int len, char flag)
+{
+ text *item;
+
+ item = (text *) palloc(VARHDRSZ + len + 1);
+ SET_VARSIZE(item, VARHDRSZ + len + 1);
+
+ *VARDATA(item) = flag;
+
+ memcpy(VARDATA(item) + 1, str, len);
+
+ return item;
+}
+
+/*
+ * Create a textual representation of a jsonbValue for GIN storage.
+ */
+static text *
+make_scalar_key(const JsonbValue * scalarVal, char flag)
+{
+ text *item;
+ char *cstr;
+
+ switch (scalarVal->type)
+ {
+ case jbvNull:
+ item = make_text_key("n", 1, flag);
+ break;
+ case jbvBool:
+ item = make_text_key(scalarVal->boolean ? "t" : "f", 1, flag);
+ break;
+ case jbvNumeric:
+ /*
+ * A normalized textual representation, free of trailing zeroes is
+ * is required.
+ *
+ * It isn't ideal that numerics are stored in a relatively bulky
+ * textual format. However, it's a notationally convenient way of
+ * storing a "union" type in the GIN B-Tree, and indexing Jsonb
+ * strings takes precedence.
+ */
+ cstr = numeric_normalize(scalarVal->numeric);
+ item = make_text_key(cstr, strlen(cstr), flag);
+ pfree(cstr);
+ break;
+ case jbvString:
+ item = make_text_key(scalarVal->string.val, scalarVal->string.len,
+ flag);
+ break;
+ default:
+ elog(ERROR, "invalid jsonb scalar type");
+ }
+
+ return item;
+}