begriffs open source - ai-pg/blob - full-docs/txt/xtypes.txt

   1
   2 36.13. User-Defined Types #
   3
   4    36.13.1. TOAST Considerations
   5
   6    As described in Section 36.2, PostgreSQL can be extended to support new
   7    data types. This section describes how to define new base types, which
   8    are data types defined below the level of the SQL language. Creating a
   9    new base type requires implementing functions to operate on the type in
  10    a low-level language, usually C.
  11
  12    The examples in this section can be found in complex.sql and complex.c
  13    in the src/tutorial directory of the source distribution. See the
  14    README file in that directory for instructions about running the
  15    examples.
  16
  17    A user-defined type must always have input and output functions. These
  18    functions determine how the type appears in strings (for input by the
  19    user and output to the user) and how the type is organized in memory.
  20    The input function takes a null-terminated character string as its
  21    argument and returns the internal (in memory) representation of the
  22    type. The output function takes the internal representation of the type
  23    as argument and returns a null-terminated character string. If we want
  24    to do anything more with the type than merely store it, we must provide
  25    additional functions to implement whatever operations we'd like to have
  26    for the type.
  27
  28    Suppose we want to define a type complex that represents complex
  29    numbers. A natural way to represent a complex number in memory would be
  30    the following C structure:
  31 typedef struct Complex {
  32     double      x;
  33     double      y;
  34 } Complex;
  35
  36    We will need to make this a pass-by-reference type, since it's too
  37    large to fit into a single Datum value.
  38
  39    As the external string representation of the type, we choose a string
  40    of the form (x,y).
  41
  42    The input and output functions are usually not hard to write,
  43    especially the output function. But when defining the external string
  44    representation of the type, remember that you must eventually write a
  45    complete and robust parser for that representation as your input
  46    function. For instance:
  47 PG_FUNCTION_INFO_V1(complex_in);
  48
  49 Datum
  50 complex_in(PG_FUNCTION_ARGS)
  51 {
  52     char       *str = PG_GETARG_CSTRING(0);
  53     double      x,
  54                 y;
  55     Complex    *result;
  56
  57     if (sscanf(str, " ( %lf , %lf )", &x, &y) != 2)
  58         ereport(ERROR,
  59                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
  60                  errmsg("invalid input syntax for type %s: \"%s\"",
  61                         "complex", str)));
  62
  63     result = (Complex *) palloc(sizeof(Complex));
  64     result->x = x;
  65     result->y = y;
  66     PG_RETURN_POINTER(result);
  67 }
  68
  69
  70    The output function can simply be:
  71 PG_FUNCTION_INFO_V1(complex_out);
  72
  73 Datum
  74 complex_out(PG_FUNCTION_ARGS)
  75 {
  76     Complex    *complex = (Complex *) PG_GETARG_POINTER(0);
  77     char       *result;
  78
  79     result = psprintf("(%g,%g)", complex->x, complex->y);
  80     PG_RETURN_CSTRING(result);
  81 }
  82
  83
  84    You should be careful to make the input and output functions inverses
  85    of each other. If you do not, you will have severe problems when you
  86    need to dump your data into a file and then read it back in. This is a
  87    particularly common problem when floating-point numbers are involved.
  88
  89    Optionally, a user-defined type can provide binary input and output
  90    routines. Binary I/O is normally faster but less portable than textual
  91    I/O. As with textual I/O, it is up to you to define exactly what the
  92    external binary representation is. Most of the built-in data types try
  93    to provide a machine-independent binary representation. For complex, we
  94    will piggy-back on the binary I/O converters for type float8:
  95 PG_FUNCTION_INFO_V1(complex_recv);
  96
  97 Datum
  98 complex_recv(PG_FUNCTION_ARGS)
  99 {
 100     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
 101     Complex    *result;
 102
 103     result = (Complex *) palloc(sizeof(Complex));
 104     result->x = pq_getmsgfloat8(buf);
 105     result->y = pq_getmsgfloat8(buf);
 106     PG_RETURN_POINTER(result);
 107 }
 108
 109 PG_FUNCTION_INFO_V1(complex_send);
 110
 111 Datum
 112 complex_send(PG_FUNCTION_ARGS)
 113 {
 114     Complex    *complex = (Complex *) PG_GETARG_POINTER(0);
 115     StringInfoData buf;
 116
 117     pq_begintypsend(&buf);
 118     pq_sendfloat8(&buf, complex->x);
 119     pq_sendfloat8(&buf, complex->y);
 120     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 121 }
 122
 123
 124    Once we have written the I/O functions and compiled them into a shared
 125    library, we can define the complex type in SQL. First we declare it as
 126    a shell type:
 127 CREATE TYPE complex;
 128
 129    This serves as a placeholder that allows us to reference the type while
 130    defining its I/O functions. Now we can define the I/O functions:
 131 CREATE FUNCTION complex_in(cstring)
 132     RETURNS complex
 133     AS 'filename'
 134     LANGUAGE C IMMUTABLE STRICT;
 135
 136 CREATE FUNCTION complex_out(complex)
 137     RETURNS cstring
 138     AS 'filename'
 139     LANGUAGE C IMMUTABLE STRICT;
 140
 141 CREATE FUNCTION complex_recv(internal)
 142    RETURNS complex
 143    AS 'filename'
 144    LANGUAGE C IMMUTABLE STRICT;
 145
 146 CREATE FUNCTION complex_send(complex)
 147    RETURNS bytea
 148    AS 'filename'
 149    LANGUAGE C IMMUTABLE STRICT;
 150
 151    Finally, we can provide the full definition of the data type:
 152 CREATE TYPE complex (
 153    internallength = 16,
 154    input = complex_in,
 155    output = complex_out,
 156    receive = complex_recv,
 157    send = complex_send,
 158    alignment = double
 159 );
 160
 161    When you define a new base type, PostgreSQL automatically provides
 162    support for arrays of that type. The array type typically has the same
 163    name as the base type with the underscore character (_) prepended.
 164
 165    Once the data type exists, we can declare additional functions to
 166    provide useful operations on the data type. Operators can then be
 167    defined atop the functions, and if needed, operator classes can be
 168    created to support indexing of the data type. These additional layers
  169    are discussed in the following sections.
 170
 171    If the internal representation of the data type is variable-length, the
 172    internal representation must follow the standard layout for
 173    variable-length data: the first four bytes must be a char[4] field
 174    which is never accessed directly (customarily named vl_len_). You must
 175    use the SET_VARSIZE() macro to store the total size of the datum
 176    (including the length field itself) in this field and VARSIZE() to
 177    retrieve it. (These macros exist because the length field may be
  178    encoded differently depending on the platform.)
 179
 180    For further details see the description of the CREATE TYPE command.
 181
 182 36.13.1. TOAST Considerations #
 183
 184    If the values of your data type vary in size (in internal form), it's
 185    usually desirable to make the data type TOAST-able (see Section 66.2).
 186    You should do this even if the values are always too small to be
 187    compressed or stored externally, because TOAST can save space on small
 188    data too, by reducing header overhead.
 189
 190    To support TOAST storage, the C functions operating on the data type
 191    must always be careful to unpack any toasted values they are handed by
 192    using PG_DETOAST_DATUM. (This detail is customarily hidden by defining
 193    type-specific GETARG_DATATYPE_P macros.) Then, when running the CREATE
 194    TYPE command, specify the internal length as variable and select some
 195    appropriate storage option other than plain.
 196
 197    If data alignment is unimportant (either just for a specific function
 198    or because the data type specifies byte alignment anyway) then it's
 199    possible to avoid some of the overhead of PG_DETOAST_DATUM. You can use
 200    PG_DETOAST_DATUM_PACKED instead (customarily hidden by defining a
  201    GETARG_DATATYPE_PP macro) and use the macros VARSIZE_ANY_EXHDR and
  202    VARDATA_ANY to access a potentially-packed datum. Note that the data
 203    returned by these macros is not aligned even if the data type
 204    definition specifies an alignment. If the alignment is important you
 205    must go through the regular PG_DETOAST_DATUM interface.
 206
 207 Note
 208
 209    Older code frequently declares vl_len_ as an int32 field instead of
 210    char[4]. This is OK as long as the struct definition has other fields
 211    that have at least int32 alignment. But it is dangerous to use such a
 212    struct definition when working with a potentially unaligned datum; the
 213    compiler may take it as license to assume the datum actually is
 214    aligned, leading to core dumps on architectures that are strict about
 215    alignment.
 216
 217    Another feature that's enabled by TOAST support is the possibility of
 218    having an expanded in-memory data representation that is more
 219    convenient to work with than the format that is stored on disk. The
 220    regular or “flat” varlena storage format is ultimately just a blob of
 221    bytes; it cannot for example contain pointers, since it may get copied
 222    to other locations in memory. For complex data types, the flat format
 223    may be quite expensive to work with, so PostgreSQL provides a way to
 224    “expand” the flat format into a representation that is more suited to
 225    computation, and then pass that format in-memory between functions of
 226    the data type.
 227
 228    To use expanded storage, a data type must define an expanded format
 229    that follows the rules given in src/include/utils/expandeddatum.h, and
 230    provide functions to “expand” a flat varlena value into expanded format
 231    and “flatten” the expanded format back to the regular varlena
 232    representation. Then ensure that all C functions for the data type can
 233    accept either representation, possibly by converting one into the other
 234    immediately upon receipt. This does not require fixing all existing
 235    functions for the data type at once, because the standard
 236    PG_DETOAST_DATUM macro is defined to convert expanded inputs into
 237    regular flat format. Therefore, existing functions that work with the
 238    flat varlena format will continue to work, though slightly
 239    inefficiently, with expanded inputs; they need not be converted until
 240    and unless better performance is important.
 241
 242    C functions that know how to work with an expanded representation
 243    typically fall into two categories: those that can only handle expanded
 244    format, and those that can handle either expanded or flat varlena
 245    inputs. The former are easier to write but may be less efficient
 246    overall, because converting a flat input to expanded form for use by a
 247    single function may cost more than is saved by operating on the
 248    expanded format. When only expanded format need be handled, conversion
 249    of flat inputs to expanded form can be hidden inside an
 250    argument-fetching macro, so that the function appears no more complex
 251    than one working with traditional varlena input. To handle both types
 252    of input, write an argument-fetching function that will detoast
 253    external, short-header, and compressed varlena inputs, but not expanded
 254    inputs. Such a function can be defined as returning a pointer to a
 255    union of the flat varlena format and the expanded format. Callers can
 256    use the VARATT_IS_EXPANDED_HEADER() macro to determine which format
 257    they received.
 258
 259    The TOAST infrastructure not only allows regular varlena values to be
 260    distinguished from expanded values, but also distinguishes “read-write”
 261    and “read-only” pointers to expanded values. C functions that only need
 262    to examine an expanded value, or will only change it in safe and
 263    non-semantically-visible ways, need not care which type of pointer they
 264    receive. C functions that produce a modified version of an input value
 265    are allowed to modify an expanded input value in-place if they receive
 266    a read-write pointer, but must not modify the input if they receive a
 267    read-only pointer; in that case they have to copy the value first,
 268    producing a new value to modify. A C function that has constructed a
 269    new expanded value should always return a read-write pointer to it.
 270    Also, a C function that is modifying a read-write expanded value
 271    in-place should take care to leave the value in a sane state if it
 272    fails partway through.
 273
 274    For examples of working with expanded values, see the standard array
 275    infrastructure, particularly src/backend/utils/adt/array_expanded.c.