reggenomedocumentmetadata.schema.json
Schema describing every property RegGenome emits for a corpus document in all-documents.json. Derived by walking 8,865 records on 2026-06-09. Source file: Nexus/06 – Engineering/Data Files/all-documents.json. Every record is a flat object at the top level (no nested arrays of records). All fields except `processed` and the deeper sector levels are present in 100% of records.
type · thing | RegGenome vocabulary | status · draft | v1
{
"Reggenomedocumentmetadata": {
"@type": "Reggenomedocumentmetadata",
"schemaVersion": 1,
"document_id": {
"@type": "Document_id"
},
"title": "String",
"source_urls": [
{
"source_urls(item)": "URL"
}
],
"publisher": {
"@type": "Publisher",
"id": "String",
"name": "String",
"jurisdiction": "String"
},
"doctype": "String",
"published": "Datetime",
"last_updated": "Datetime",
"crawled": "Datetime",
"processed": "Datetime",
"sector": {
"@type": "Sector",
"level_1": "String",
"level_2": "String",
"level_3": "String",
"level_4": "String"
},
"cyber_score": "Decimal",
"authoritative": [
{
"authoritative(item)": "String"
}
],
"content_type": "String",
"restriction": "String",
"copyright_notice": "String"
}
}
{
"Reggenomedocumentmetadata": {
"@type": "Reggenomedocumentmetadata",
"schemaVersion": 1,
"description": "Schema describing every property RegGenome emits for a corpus document in all-documents.json. Derived by walking 8,865 records on 2026-06-09. Source file: Nexus/06 – Engineering/Data Files/all-documents.json. Every record is a flat object at the top level (no nested arrays of records). All fields except `processed` and the deeper sector levels are present in 100% of records.",
"document_id": {
"@type": "Document_id",
"description": "Stable RegGenome identifier. Acts as the corpus-wide primary key and is the basename of the original-format file (e.g., {document_id}.pdf) and of the markdown conversion ({document_id}.md)."
},
"title": {
"description": "Document title as published. May carry leading whitespace or trailing newlines — normalize before bibliographic use. Federal Register documents prefix with 'FR Doc. {year}-{number}:'.",
"type": "String"
},
"source_urls": [
{
"description": "Authoritative URLs for the document at its publisher's site. Always exactly one element in the observed corpus. This is the canonical URL — use it as the BibTeX `url` field, NOT the package storage URL.",
"source_urls(item)": {
"type": "URL"
}
}
],
"publisher": {
"@type": "Publisher",
"description": "The issuing authority. The same publisher may have many `id` aliases across the corpus, but `name` is stable.",
"id": {
"description": "Slug identifier, kebab-case. Useful for filtering and joining.",
"type": "String"
},
"name": {
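"description": "Human-readable name of the issuing authority. Stable across the corpus even when the same publisher appears under multiple `id` aliases.",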
"type": "String"
},
"jurisdiction": {
"description": "ISO 3166-1 alpha-2 country code, plus 'International Standard-Setters' for cross-border bodies (IOSCO, BIS, FATF, etc.). Allowed values: AU, CA, DE, GB, International Standard-Setters, NZ, US.",
"type": "String"
}
},
"doctype": {
"description": "RegGenome's controlled vocabulary for document genre. Drives BibTeX entry-type selection (Guidance/Research/Technical Standards → @techreport when publisher is NIST/CISA/etc.; everything else → @misc). Allowed values: Codified rulebooks, Consultation Responses Received, Consultations and Calls for Input, Corporate Actions, Operations and Reporting, Firm-level actions and enforcement, Guidance and FAQs, Individual Guidance, Laws, Legislative Codes, Lists, Forms and Tables, Other, Policies and Decisions, … (16 total).",
"type": "String"
},
"published": {
"description": "ISO 8601 timestamp of original publication. The year portion drives the BibTeX `year` field.",
"type": "Datetime"
},
"last_updated": {
"description": "ISO 8601. RegGenome's last metadata refresh of this record (not necessarily the document's last revision).",
"type": "Datetime"
},
"crawled": {
"description": "ISO 8601. When RegGenome fetched the source URL. Use as BibTeX `urldate`.",
"type": "Datetime"
},
"processed": {
"description": "ISO 8601. When RegGenome's pipeline last reprocessed the record. Null for 29 records (0.3%).",
"nullable": true,
"type": "Datetime"
},
"sector": {
"@type": "Sector",
"description": "Four-level industry classification. Level 1 is always populated; lower levels narrow progressively and are null when the document is broader than that level.",
"level_1": {
"description": "Top-level industry. 18 distinct values across the corpus.",
"nullable": false,
"type": "String"
},
"level_2": {
"description": "Mid-tier sub-industry. Null for 33.8% of records.",
"nullable": true,
"type": "String"
},
"level_3": {
"description": "Narrow sub-industry. Null for 38.1%.",
"nullable": true,
"type": "String"
},
"level_4": {
"description": "Most specific sub-industry. Null for 49.6%.",
"nullable": true,
"type": "String"
}
},
"cyber_score": {
"description": "RegGenome's cyber-relevance signal, 0.0–1.0 (higher = more cyber-relevant). Source heuristic not documented by RegGenome. Useful for downstream filtering when building cyber-only corpora.",
"type": "Decimal"
},
"authoritative": [
{
"description": "Tags marking the document as authoritative on a topic. Empty in 99.88% of records; when populated, the only observed value is 'cyber' (11 records). Treat as a hint, not a primary filter.",
"authoritative(item)": {
"description": "Allowed values: cyber.",
"type": "String"
}
}
],
"content_type": {
"description": "MIME type of the original file. Constant across the observed corpus. Allowed values: application/pdf.",
"type": "String"
},
"restriction": {
"description": "Distribution restriction. Constant 'unrestricted' across the observed corpus. Allowed values: unrestricted.",
"type": "String"
},
"copyright_notice": {
"description": "Free-text copyright line as published. Often includes the publisher and year.",
"type": "String"
}
}
}