templates/metadata/metaschema/metaschema.1.schema.json

{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "id": "http://jsonschema.net",
  "type": "object",
  "description": "Metaschema for validating the structure of the per-doctype schemas in this repository",
  "properties": {
    "mozPipelineMetadata": {
      "type": "object",
      "description": "Container for per-doctype metadata that can affect how pings are processed in the pipeline",
      "additionalProperties": false,
      "properties": {
        "$comment": {
          "type": "string",
          "description": "Optional comment about the pipeline handling for this doctype"
        },
        "bq_table": {
          "type": "string",
          "description": "NOT YET IMPLEMENTED: The name of the destination BigQuery table"
        },
        "bq_dataset_family": {
          "type": "string",
          "description": "NOT YET IMPLEMENTED: The base name for the destination BigQuery dataset; if this is set to 'telemetry', the pipeline will write into 'telemetry_live'"
        },
        "bq_metadata_format": {
          "type": "string",
          "description": "The logical format for the metadata struct in the destination BigQuery table",
          "enum": ["structured", "telemetry"]
        },
        "submission_timestamp_granularity": {
          "type": "string",
          "description": "If specified, the submission_timestamp field will be truncated to the specified granularity in the pipeline before being output to BigQuery; this can be used to reduce the potential for using time-based attacks to correlate datasets using different client-level identifiers; see Java's ChronoUnit for additional granularities that could be considered for inclusion; implemented for bug 1742172",
            "enum": ["millis", "seconds", "minutes", "hours", "days"]
        },
        "expiration_policy": {
          "type": "object",
          "description": "Various options controlling data lifecycle",
          "additionalProperties": false,
          "properties": {
            "delete_after_days": {
              "type": "integer",
              "description": "If present, a time_partitioning_expiration policy will be set on the destination stable table in BigQuery"
            },
            "collect_through_date": {
              "type": "string",
              "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
              "description": "If present, the pipeline will reject new data with submission_timestamp after the given date, sending it to error output"
            }
          }
        },
        "include_client_id": {
          "type": "boolean",
          "description": "Glean ping property that determines whether a client id is sent in the ping."
        },
        "include_info_sections": {
          "type": "boolean",
          "description": "Glean ping property that determines whether info sections are sent, e.g. client_info, ping_info."
        },
        "override_attributes": {
          "type": "array",
          "description": "Mappings of Pub/Sub attribute names to static values; these are applied in the Decoder immediately before incorporating metadata into the payload, so can be used to overwrite values calculated in the pipeline; a null value will cause the pipeline to drop the named attribute; some attribute names differ from the nested metadata format in BigQuery, so for example you must use \"geo_city\" here in order to manipulate the value that shows up as metadata.geo.city; implemented for bug 1742172",
          "items": {
            "type": "object",
            "additionalProperties": false,
            "properties": {
              "name":{
                "type": "string",
                "enum": [
                  "geo_city",
                  "geo_subdivision1",
                  "geo_subdivision2",
                  "normalized_channel"
                ]
              },
              "value":{
                "type": ["string", "null"]
              }
            },
            "required": [
              "name",
              "value"
            ]
          }
        },
        "geoip_skip_entries": {
          "description": "If present, how many additional entries (beyond two) to skip in x_forwarded_for when performing geoip decoding, useful when submissions are ingested from trusted proxies; if there are fewer entries in x_forwarded_for than (N+1) the last entry is used instead of (N+3)rd-to-last",
          "type": "integer"
        },
        "jwe_mappings": {
          "type": "array",
          "description": "Mappings of encrypted JWE field paths to destinations where the value decrypted by the pipeline should be placed; initial use case is Account Ecosystem Telemetry; paths must be in [JSON Pointer format](https://tools.ietf.org/html/rfc6901) like '/payload/ecosystemAnonId'",
          "items": {
            "type": "object",
            "additionalProperties": false,
            "properties": {
              "source_field_path":{
                "type": "string",
                "pattern": "^/.*$"
              },
              "decrypted_field_path":{
                "type": "string",
                "pattern": "^/.*$"
              }
            },
            "required": [
              "source_field_path",
              "decrypted_field_path"
            ]
          }
        },
        "sample_id_source_uuid_attribute": {
          "description": "If specified, sample_id will be calculated from a hash of the specified attribute if it is a valid UUID after removing curly brackets; if neither this nor sample_id_source_uuid_payload_path are specified, the client_id attribute will be used; implemented for DENG-547",
          "type": "string"
        },
        "sample_id_source_uuid_payload_path": {
          "description": "If specified and sample_id was not set due to sample_id_source_uuid_attribute, then sample_id will be calculated from a hash of the specified payload path; this is specified as a list strings to allow indicating nested fields, so for example metrics.uuid.legacy_ids_client_id would be specified as [\"metrics\",\"uuid\",\"legacy_ids_client_id\"]; if neither this nor sample_id_source_uuid_attribute are specified, the client_id attribute will be used; implemented for DENG-547",
          "items": {
            "type": "string"
          },
          "type": "array"
        },
        "split_config": {
          "type": "object",
          "description": "Configuration for splitting a ping into multiple pings by field",
          "additionalProperties": false,
          "properties": {
            "preserve_original": {
              "type": "boolean",
              "description": "Whether or not to output the unmodified original ping in addition to any generated pings."
            },
            "remainder": {
              "type": "object",
              "description": "If present, generate a ping containing all fields not included in any subset ping.",
              "additionalProperties": false,
              "properties": {
                "document_namespace": {
                  "type": "string"
                },
                "document_type": {
                  "type": "string"
                },
                "document_version": {
                  "type": "string"
                }
              },
              "required": [
                "document_namespace",
                "document_type",
                "document_version"
              ]
            },
            "subsets": {
              "type": "array",
              "description": "Array of subset pings to generate.",
              "items": {
                "type": "object",
                "description": "Configuration for generating a ping that is a subset of fields from the original ping.",
                "additionalProperties": false,
                "properties": {
                  "document_namespace": {
                    "type": "string"
                  },
                  "document_type": {
                    "type": "string"
                  },
                  "document_version": {
                    "type": "string"
                  },
                  "pattern": {
                    "type": "string",
                    "description": "Regular expression matching .-delimited property names that should be moved to this subset ping. Only properties explictly defined in the non-generic json schema of the original ping are supported, because property names are matched during schema generation."
                  },
                  "extra_pattern": {
                    "type": "string",
                    "description": "Like pattern, except the schema of matched properties must also be present in the remainder, because schemas cannot delete fields. Data for matched properties will only go to this subset ping."
                  }
                },
                "required": [
                  "document_namespace",
                  "document_type",
                  "document_version",
                  "pattern"
                ]
              }
            }
          },
          "required": [
            "preserve_original",
            "subsets"
          ]
        },
        "json_object_path_regex": {
          "description": "The path for which a JSON column will be enforced. This should be a regular expression which is used by the jsonschema-transpiler to match against the fully qualified name of a metric",
          "type": "string"
        }
      }
    }
  }
}