diff --git a/README.md b/README.md index 3add90f6..3de1f5e8 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,74 @@ install packages api key: 12345 See [discussion here](https://github.com/SuffolkLITLab/docassemble-AssemblyLine/issues/69) +# Answer Set Import Safety Configuration + +Answer set JSON imports are intentionally restricted to reduce risk from malformed and malicious payloads. + +Default behavior: +- Plain JSON values are imported by default, and object reconstruction is allowed only for allowlisted DAObject classes. +- Top-level variable names must match `^[A-Za-z][A-Za-z0-9_]*$`. +- Internal/protected variable names are blocked. +- If `answer set import allowed variables` is not set, imports use a denylist-only policy for backwards compatibility. +- Object payloads can be imported when classes are allowlisted; by default, known `docassemble.base` and `docassemble.AssemblyLine` DAObject descendants are allowed. + +Default import limits (`assembly line: answer set import limits`): +- `max bytes`: `1048576` (1 MB) +- `max depth`: `40` +- `max keys`: `20000` +- `max list items`: `5000` +- `max string length`: `200000` +- `max number abs`: `1000000000000000` (`10**15`) + +Final allowlist/config policy: +- Default allowlist: unset (`answer set import allowed variables` omitted), to avoid breaking existing interviews unexpectedly. +- Recommended production policy: set an explicit allowlist to only shared/reusable variables in your jurisdiction. +- `answer set import allow objects` defaults to `true`; set it to `false` if you want strict plain-JSON-only imports. +- `answer set import allowed object classes` can extend the default DAObject class allowlist with explicit additional class paths. +- Additional classes in `answer set import allowed object classes` apply to object envelopes at any depth (top-level variables and nested descendants). 
+- `answer set import remap known classes` defaults to `true`; this safely maps known class basenames from other packages (such as playground exports) onto official allowlisted classes. +- `answer set import class remap` can define explicit basename-to-class mappings for additional controlled remaps. + +Example hardened configuration: + +```yaml +assembly line: + enable answer sets: true + enable answer set imports: true + answer set import require signed: false + answer set import allow objects: true + answer set import remap known classes: true + answer set import limits: + max bytes: 1048576 + max depth: 40 + max keys: 20000 + max list items: 5000 + max string length: 200000 + max number abs: 1000000000000000 + answer set import allowed variables: + - users_name + - users_address + - users_phone_number + - users_email + - household_size + answer set import allowed object classes: + - docassemble.AssemblyLine.al_general.ALIndividual + - docassemble.AssemblyLine.al_general.ALPeopleList + - docassemble.AssemblyLine.al_general.ALAddress + answer set import class remap: + ALIndividual: docassemble.AssemblyLine.al_general.ALIndividual + ALPeopleList: docassemble.AssemblyLine.al_general.ALPeopleList +``` + +Notes: +- Keeping `answer set import require signed: false` matches current compatibility-first behavior; unsigned imports are still subject to strict structural validation. +- If your environment can manage signing keys, set `answer set import require signed: true` to require signed payloads. +- Class allowlisting uses full dotted class names (exact match), not wildcard patterns. +- Playground-authored classes usually need explicit allowlisting, e.g. `docassemble.playground1.al_general.ALIndividual`. +- If a playground package name changes across environments (for example `playground1` to `playground2`), update `answer set import allowed object classes` to match the runtime class path. 
+- With `answer set import remap known classes: true`, exports that use known class basenames (for example `docassemble.playground1.al_general.ALIndividual`) can be remapped to official allowlisted classes without instantiating the playground class. + + # ALDocument class ## Purpose diff --git a/docassemble/AssemblyLine/data/questions/al_document.yml b/docassemble/AssemblyLine/data/questions/al_document.yml index 23638d50..f14d9cfd 100644 --- a/docassemble/AssemblyLine/data/questions/al_document.yml +++ b/docassemble/AssemblyLine/data/questions/al_document.yml @@ -34,9 +34,9 @@ code: | key=action_argument("key"), preferred_formats=preferred_formats, ) - email_arg = action_argument('email') + email_arg = action_argument("email") if isinstance(email_arg, list): - email_str = ', '.join(email_arg) + email_str = ", ".join(email_arg) else: email_str = str(email_arg) if email_success: @@ -72,9 +72,9 @@ code: | key=action_argument("key"), preferred_formats=preferred_formats, ) - email_arg = action_argument('email') + email_arg = action_argument("email") if isinstance(email_arg, list): - email_str = ', '.join(email_arg) + email_str = ", ".join(email_arg) else: email_str = str(email_arg) if email_success: diff --git a/docassemble/AssemblyLine/data/questions/al_saved_sessions.yml b/docassemble/AssemblyLine/data/questions/al_saved_sessions.yml index 846df76d..dce14f3f 100644 --- a/docassemble/AssemblyLine/data/questions/al_saved_sessions.yml +++ b/docassemble/AssemblyLine/data/questions/al_saved_sessions.yml @@ -3,6 +3,7 @@ code: | # HACK # Create a placeholder value to avoid playground errors al_sessions_snapshot_results = DAEmpty() + al_sessions_last_import_report = {"accepted": [], "rejected": [], "warnings": []} --- initial: True code: | @@ -217,8 +218,6 @@ back button: False --- id: al sessions load status continue button field: al_sessions_load_status -comment: | - #TODO There's no error handling yet so this might be a lie question: | % if al_sessions_snapshot_results: 
Your answer set was loaded @@ -226,9 +225,32 @@ question: | Your answer set was not loaded. You can try again. % endif subquestion: | + % if defined('al_sessions_last_import_report'): + % if al_sessions_last_import_report.get('warnings'): + ${ collapse_template(al_sessions_import_warnings_template) } + % endif + % if al_sessions_last_import_report.get('rejected'): + ${ collapse_template(al_sessions_import_rejected_template) } + % endif + % endif + Tap "next" to keep answering any unanswered questions and finish the interview. back button: False --- +template: al_sessions_import_warnings_template +subject: Import warnings +content: | + % for warning in al_sessions_last_import_report.get('warnings', []): + * ${ warning } + % endfor +--- +template: al_sessions_import_rejected_template +subject: Variables skipped during import +content: | + % for item in al_sessions_last_import_report.get('rejected', []): + * `${ item.get('path', '?') }`: ${ item.get('reason', 'unknown reason') } + % endfor +--- question: | Upload a JSON file subquestion: | @@ -239,11 +261,9 @@ fields: accept: | "application/json, text/json, text/*, .json" validation code: | - try: - json.loads(al_sessions_json_file.slurp()) - except: - validation_error("Upload a file with valid JSON") + is_valid_json(al_sessions_json_file.slurp()) --- code: | al_sessions_snapshot_results = load_interview_json(al_sessions_json_file.slurp()) + al_sessions_last_import_report = get_last_import_report() al_sessions_import_json = True \ No newline at end of file diff --git a/docassemble/AssemblyLine/data/questions/al_settings.yml b/docassemble/AssemblyLine/data/questions/al_settings.yml index 31967096..e1800521 100644 --- a/docassemble/AssemblyLine/data/questions/al_settings.yml +++ b/docassemble/AssemblyLine/data/questions/al_settings.yml @@ -119,4 +119,10 @@ code: | --- code: | # Can be an exact path or just a name, in which case we will search /usr/share/fonts and /var/www/.fonts for a matching file ending in .ttf - 
al_typed_signature_font = "/usr/share/fonts/truetype/google-fonts/BadScript-Regular.ttf" \ No newline at end of file + al_typed_signature_font = "/usr/share/fonts/truetype/google-fonts/BadScript-Regular.ttf" +--- +code: | + # Allow users to import answer sets from JSON files. + # The global config 'enable answer set imports' is checked first; this variable allows + # interview authors to disable imports at the interview level even if global config permits them. + al_allow_answer_set_imports = True \ No newline at end of file diff --git a/docassemble/AssemblyLine/data/questions/al_visual.yml b/docassemble/AssemblyLine/data/questions/al_visual.yml index 32324626..f4aab12b 100644 --- a/docassemble/AssemblyLine/data/questions/al_visual.yml +++ b/docassemble/AssemblyLine/data/questions/al_visual.yml @@ -117,6 +117,7 @@ data from code: ( get_config('assembly line',{}).get('enable answer sets') and get_config('assembly line',{}).get('enable answer set imports') + and al_allow_answer_set_imports ) or (user_logged_in() and user_has_privilege('admin')) ) diff --git a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malformed_trailing_comma.json b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malformed_trailing_comma.json new file mode 100644 index 00000000..75ae562f --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malformed_trailing_comma.json @@ -0,0 +1,4 @@ +{ + "users_name": "Alex", + "city": "Boston", +} diff --git a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_dunder_key.json b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_dunder_key.json new file mode 100644 index 00000000..873de7c5 --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_dunder_key.json @@ -0,0 +1,5 @@ +{ + "users_name": "Alex", + "__class__": "builtins.object", + "city": "Boston" +} diff --git 
a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_internal_key.json b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_internal_key.json new file mode 100644 index 00000000..d6a21c11 --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/malicious_internal_key.json @@ -0,0 +1,7 @@ +{ + "users_name": "Alex", + "_internal": { + "steps": 99 + }, + "city": "Boston" +} diff --git a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_playground_alias.json b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_playground_alias.json new file mode 100644 index 00000000..b46bb7b5 --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_playground_alias.json @@ -0,0 +1,22 @@ +{ + "users": { + "_class": "docassemble.playground1.al_general.ALPeopleList", + "instanceName": "users", + "object_type": { + "_class": "type", + "name": "docassemble.playground1.al_general.ALIndividual" + }, + "elements": [ + { + "_class": "docassemble.playground1.al_general.ALIndividual", + "instanceName": "users[0]", + "name": { + "_class": "docassemble.playground1.al_general.IndividualName", + "instanceName": "users[0].name", + "first": "Client", + "last": "Example" + } + } + ] + } +} diff --git a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_unknown_class.json b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_unknown_class.json new file mode 100644 index 00000000..e0d84862 --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_unknown_class.json @@ -0,0 +1,6 @@ +{ + "users": { + "_class": "docassemble.bad.Actor", + "instanceName": "users" + } +} diff --git a/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_with_reference.json 
b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_with_reference.json new file mode 100644 index 00000000..bfc7b32c --- /dev/null +++ b/docassemble/AssemblyLine/data/sources/answer_set_import_samples/object_graph_with_reference.json @@ -0,0 +1,45 @@ +{ + "users": { + "_class": "docassemble.AssemblyLine.al_general.ALPeopleList", + "instanceName": "users", + "object_type": { + "_class": "type", + "name": "docassemble.AssemblyLine.al_general.ALIndividual" + }, + "elements": [ + { + "_class": "docassemble.AssemblyLine.al_general.ALIndividual", + "instanceName": "users[0]", + "name": { + "_class": "docassemble.base.util.IndividualName", + "instanceName": "users[0].name", + "first": "Client", + "last": "Example" + }, + "agent": { + "_class": "docassemble.AssemblyLine.al_general.ALIndividual", + "instanceName": "spouse" + }, + "custom_text": "notes", + "custom_float": 1.25, + "custom_dict": { + "_class": "docassemble.base.util.DADict", + "instanceName": "users[0].custom_dict", + "elements": { + "case": "A123" + } + } + }, + { + "_class": "docassemble.AssemblyLine.al_general.ALIndividual", + "instanceName": "spouse", + "name": { + "_class": "docassemble.base.util.IndividualName", + "instanceName": "spouse.name", + "first": "Spouse", + "last": "Example" + } + } + ] + } +} diff --git a/docassemble/AssemblyLine/sessions.py b/docassemble/AssemblyLine/sessions.py index 3f2bbe8d..a92b59eb 100644 --- a/docassemble/AssemblyLine/sessions.py +++ b/docassemble/AssemblyLine/sessions.py @@ -41,7 +41,9 @@ ALExhibitList, ALStaticDocument, ) +import docassemble.base.util import json +import math import os import re import hashlib @@ -69,6 +71,7 @@ "is_valid_json", "load_interview_answers", "load_interview_json", + "get_last_import_report", "nice_interview_subtitle", "rename_current_session", "rename_interview_answers", @@ -96,9 +99,13 @@ "metadata_title", "multi_user", "nav", + "role", + "role_event", + "role_needed", "session_local", "speak_text", "url_args", 
+ "user_dict", "user_local", # Database-like fields we don't need to copy "all_courts", @@ -211,6 +218,562 @@ system_interviews: List[Dict[str, Any]] = interview_menu() +# Conservative defaults for untrusted JSON answer set imports. +DEFAULT_IMPORT_MAX_BYTES = 1024 * 1024 # 1 MB +DEFAULT_IMPORT_MAX_DEPTH = 40 +DEFAULT_IMPORT_MAX_KEYS = 20000 +DEFAULT_IMPORT_MAX_LIST_ITEMS = 5000 +DEFAULT_IMPORT_MAX_STRING_LENGTH = 200000 +DEFAULT_IMPORT_MAX_NUMBER_ABS = 10**15 + +PROTECTED_IMPORT_VARIABLES: Set[str] = { + # Dunder names are already rejected by SAFE_VARIABLE_NAME_RE; + # these explicit entries cover non-dunder runtime/module symbols. + "os", + "sys", + "subprocess", + "pickle", + "json", + "server", + "daconfig", +}.union({n for n in dir(docassemble.base.util) if not n.startswith("_")}) + +DANGEROUS_KEY_PREFIXES = ("__",) +DANGEROUS_KEY_EXACT = {"_internal", "_type", "@type"} +OBJECT_METADATA_KEYS = {"_class", "instanceName"} +PROTECTED_OBJECT_ATTRS = { + # Dunder attrs are already rejected by the DANGEROUS_KEY_PREFIXES check; + # these are non-dunder docassemble internals that must not be importable. + "this_thread", + "has_nonrandom_instance_name", + "attrList", +} +SAFE_ATTR_NAME_RE = re.compile(r"\A[A-Za-z_][A-Za-z0-9_]*\Z") +SAFE_INSTANCE_NAME_RE = re.compile( + r"\A[A-Za-z_][A-Za-z0-9_]*(?:\[(?:[0-9]+|'[A-Za-z0-9_ \-]+'|\"[A-Za-z0-9_ \-]+\")\]|\.[A-Za-z_][A-Za-z0-9_]*)*\Z" +) +SAFE_VARIABLE_NAME_RE = re.compile(r"\A[A-Za-z][A-Za-z0-9_]*\Z") + +# Conservative remaps for known equivalent DA/AL class names across packages +# (e.g. playground exports). Targets are still validated against allowlist. 
+KNOWN_CLASS_REMAP_BY_BASENAME: Dict[str, str] = { + "ALIndividual": "docassemble.AssemblyLine.al_general.ALIndividual", + "ALPeopleList": "docassemble.AssemblyLine.al_general.ALPeopleList", + "ALAddress": "docassemble.AssemblyLine.al_general.ALAddress", + "ALAddressList": "docassemble.AssemblyLine.al_general.ALAddressList", + "ALNameList": "docassemble.AssemblyLine.al_general.ALNameList", + "IndividualName": "docassemble.base.util.IndividualName", + "Name": "docassemble.base.util.Name", + "Address": "docassemble.base.util.Address", + "DADict": "docassemble.base.util.DADict", + "DAList": "docassemble.base.util.DAList", + "DASet": "docassemble.base.util.DASet", +} + +_last_import_report: Dict[str, Any] = { + "accepted": [], + "rejected": [], + "warnings": [], + "limits": {}, + "contains_objects": False, + "remapped_classes": [], +} + + +def get_last_import_report() -> Dict[str, Any]: + """Return a copy of the most recent JSON import report. + + The report dictionary contains the following keys: + - ``accepted``: list of variable names that were imported successfully. + - ``rejected``: list of dicts describing paths and reasons for rejection. + - ``warnings``: list of warning messages generated during import. + - ``limits``: the numerical limits that were applied to the payload. + - ``contains_objects``: ``True`` if docassemble object envelopes were detected. + - ``remapped_classes``: list of class remapping operations that occurred. + + Returns: + Dict[str, Any]: sanitized copy of the last import report. 
+ """ + return safe_json(_last_import_report) + + +def _set_last_import_report(report: Dict[str, Any]) -> None: + global _last_import_report + _last_import_report = report + + +def _import_limits() -> Dict[str, int]: + cfg = get_config("assembly line", {}).get("answer set import limits", {}) + return { + "max_bytes": int(cfg.get("max bytes", DEFAULT_IMPORT_MAX_BYTES)), + "max_depth": int(cfg.get("max depth", DEFAULT_IMPORT_MAX_DEPTH)), + "max_keys": int(cfg.get("max keys", DEFAULT_IMPORT_MAX_KEYS)), + "max_list_items": int(cfg.get("max list items", DEFAULT_IMPORT_MAX_LIST_ITEMS)), + "max_string_length": int( + cfg.get("max string length", DEFAULT_IMPORT_MAX_STRING_LENGTH) + ), + "max_number_abs": int(cfg.get("max number abs", DEFAULT_IMPORT_MAX_NUMBER_ABS)), + "max_object_depth": int(cfg.get("max object depth", 10)), + } + + +def _parse_json_with_limits(json_string: str, limits: Dict[str, int]) -> Any: + if len(json_string.encode("utf-8", errors="ignore")) > limits["max_bytes"]: + raise ValueError("JSON file is too large") + try: + parsed = json.loads(json_string) + except json.JSONDecodeError as err: + raise ValueError("Invalid JSON format") from err + + key_count = 0 + stack: List[Tuple[Any, int]] = [(parsed, 0)] + while stack: + node, depth = stack.pop() + if depth > limits["max_depth"]: + raise ValueError("JSON nesting is too deep") + + if isinstance(node, dict): + key_count += len(node) + if key_count > limits["max_keys"]: + raise ValueError("JSON contains too many keys") + for key, val in node.items(): + if not isinstance(key, str): + raise ValueError("JSON object keys must be strings") + if len(key) > limits["max_string_length"]: + raise ValueError("JSON key is too long") + if key.startswith(DANGEROUS_KEY_PREFIXES): + raise ValueError( + f"JSON contains forbidden key '{key}' used in object reconstruction" + ) + stack.append((val, depth + 1)) + + elif isinstance(node, list): + if len(node) > limits["max_list_items"]: + raise ValueError("JSON list contains too 
many items") + for val in node: + stack.append((val, depth + 1)) + + elif isinstance(node, str): + if len(node) > limits["max_string_length"]: + raise ValueError("JSON contains a string value that is too long") + + elif isinstance(node, bool) or node is None: + continue + + elif isinstance(node, (int, float)): + if isinstance(node, float) and not math.isfinite(node): + raise ValueError("JSON contains a non-finite numeric value") + if abs(node) > limits["max_number_abs"]: + raise ValueError("JSON contains a numeric value outside allowed range") + + else: + raise ValueError("JSON contains an unsupported value type") + + return parsed + + +def _safe_variable_name(name: str) -> bool: + if not SAFE_VARIABLE_NAME_RE.match(name): + return False + if name in PROTECTED_IMPORT_VARIABLES: + return False + return True + + +def _allowed_import_variables() -> Optional[Set[str]]: + """Optional strict allowlist from config; when empty/undefined, default policy applies.""" + allowed = get_config("assembly line", {}).get("answer set import allowed variables") + if not allowed: + return None + cleaned = {str(x).strip() for x in allowed if str(x).strip()} + return cleaned if cleaned else None + + +def _allow_object_imports() -> bool: + return get_config("assembly line", {}).get("answer set import allow objects", True) + + +def _default_allowed_object_classes() -> Set[str]: + return { + "docassemble.base.core.DAObject", + "docassemble.base.util.DAObject", + "docassemble.base.util.DADict", + "docassemble.base.util.DAList", + "docassemble.base.util.DASet", + "docassemble.base.util.Individual", + "docassemble.base.util.Person", + "docassemble.base.util.Name", + "docassemble.base.util.IndividualName", + "docassemble.base.util.Address", + "docassemble.base.util.PeriodicValue", + "docassemble.base.util.FinancialList", + "docassemble.base.util.Income", + "docassemble.AssemblyLine.al_general.ALIndividual", + "docassemble.AssemblyLine.al_general.ALIndividualDict", + 
"docassemble.AssemblyLine.al_general.ALPeopleList", + "docassemble.AssemblyLine.al_general.ALAddress", + "docassemble.AssemblyLine.al_general.ALAddressList", + "docassemble.AssemblyLine.al_general.ALNameList", + "docassemble.AssemblyLine.al_courts.ALCourt", + "docassemble.AssemblyLine.al_courts.MACourt", + } + + +def _allowed_object_classes() -> Set[str]: + allowed = _default_allowed_object_classes() + configured = get_config("assembly line", {}).get( + "answer set import allowed object classes", [] + ) + for class_name in configured: + name = str(class_name).strip() + if name: + allowed.add(name) + return allowed + + +def _class_remap_table() -> Dict[str, str]: + remap = dict(KNOWN_CLASS_REMAP_BY_BASENAME) + configured = get_config("assembly line", {}).get( + "answer set import class remap", {} + ) + if isinstance(configured, dict): + for key, value in configured.items(): + source = str(key).strip() + target = str(value).strip() + if source and target: + remap[source] = target + return remap + + +def _maybe_remap_class_name( + class_name: str, + allowed_object_classes: Set[str], + remap_table: Dict[str, str], +) -> Tuple[str, bool]: + if class_name in allowed_object_classes: + return class_name, False + if not remap_table: + return class_name, False + basename = class_name.rsplit(".", 1)[-1] + target = remap_table.get(basename) + if target and target in allowed_object_classes: + return target, True + return class_name, False + + +def _is_safe_object_attr_name(name: str) -> bool: + if name in OBJECT_METADATA_KEYS: + return True + if name in PROTECTED_OBJECT_ATTRS: + return False + if name.startswith(DANGEROUS_KEY_PREFIXES): + return False + return bool(SAFE_ATTR_NAME_RE.match(name)) + + +def _is_safe_instance_name(name: str, max_length: int) -> bool: + if not isinstance(name, str): + return False + if len(name) == 0 or len(name) > max_length: + return False + if not SAFE_INSTANCE_NAME_RE.match(name): + return False + if "__" in name: + return False + return True + 
+ +def _sanitize_import_value( + value: Any, + path: str, + limits: Dict[str, int], + allow_objects: bool, + allowed_object_classes: Set[str], + remapped_classes: List[Dict[str, str]], + remap_table: Dict[str, str], + blocked_keys: Set[str], + object_stack: Optional[List[str]] = None, +) -> Tuple[Any, bool]: + """Validate and sanitize nested imported values. + + Returns: + Tuple[Any, bool]: (sanitized value, contains object envelope) + """ + if object_stack is None: + object_stack = [] + + if isinstance(value, dict): + if value.get("_class") == "type": + if not allow_objects: + raise ValueError("object imports are disabled") + if set(value.keys()) != {"_class", "name"}: + raise ValueError("type envelope can only include _class and name") + class_name = value.get("name") + if not isinstance(class_name, str): + raise ValueError("type envelope name must be a string") + resolved_class, remapped = _maybe_remap_class_name( + class_name, allowed_object_classes, remap_table + ) + if resolved_class not in allowed_object_classes: + raise ValueError(f"type envelope class '{class_name}' is not allowed") + if remapped: + remapped_classes.append( + {"path": path, "from": class_name, "to": resolved_class} + ) + return {"_class": "type", "name": resolved_class}, True + + if "_class" in value or "instanceName" in value: + if not allow_objects: + raise ValueError("object imports are disabled") + if "_class" not in value or "instanceName" not in value: + raise ValueError( + "object metadata must include both _class and instanceName" + ) + + class_name = value.get("_class") + instance_name = value.get("instanceName") + if not isinstance(class_name, str): + raise ValueError("object _class must be a string") + if not isinstance(instance_name, str): + raise ValueError("object instanceName must be a string") + + if instance_name in blocked_keys: + raise ValueError(f"object instanceName '{instance_name}' is blocked") + if not _safe_variable_name(instance_name.split(".")[0].split("[")[0]): 
+ raise ValueError( + f"object instanceName '{instance_name}' violates safe variable naming" + ) + + if instance_name in object_stack: + raise ValueError( + f"circular or repeating object reference detected: {instance_name}" + ) + if len(object_stack) >= limits.get("max_object_depth", 10): + raise ValueError("nested object envelope depth limit exceeded") + + resolved_class, remapped = _maybe_remap_class_name( + class_name, allowed_object_classes, remap_table + ) + if resolved_class not in allowed_object_classes: + raise ValueError(f"object class '{class_name}' is not allowed") + if not _is_safe_instance_name(instance_name, limits["max_string_length"]): + raise ValueError("object instanceName is invalid") + if remapped: + remapped_classes.append( + {"path": path, "from": class_name, "to": resolved_class} + ) + + obj_sanitized: Dict[str, Any] = { + "_class": resolved_class, + "instanceName": instance_name, + } + object_stack.append(instance_name) + for key, nested in value.items(): + if key in OBJECT_METADATA_KEYS: + continue + nested_path = f"{path}.{key}" + if key in blocked_keys or nested_path in blocked_keys: + continue + if not isinstance(key, str): + raise ValueError("object attribute name must be a string") + if not _is_safe_object_attr_name(key): + raise ValueError(f"unsafe object attribute '{key}'") + nested_sanitized, _ = _sanitize_import_value( + nested, + nested_path, + limits, + allow_objects, + allowed_object_classes, + remapped_classes, + remap_table, + blocked_keys, + object_stack, + ) + obj_sanitized[key] = nested_sanitized + object_stack.pop() + return obj_sanitized, True + + sanitized: Dict[str, Any] = {} + contains_object = False + for key, nested in value.items(): + nested_path = f"{path}.{key}" + if key in blocked_keys or nested_path in blocked_keys: + continue + if not isinstance(key, str): + raise ValueError("nested object key must be a string") + if key in DANGEROUS_KEY_EXACT: + raise ValueError(f"forbidden nested key '{key}'") + if 
key.startswith(DANGEROUS_KEY_PREFIXES): + raise ValueError(f"forbidden nested key '{key}'") + nested_sanitized, nested_contains_object = _sanitize_import_value( + nested, + nested_path, + limits, + allow_objects, + allowed_object_classes, + remapped_classes, + remap_table, + blocked_keys, + object_stack, + ) + sanitized[key] = nested_sanitized + contains_object = contains_object or nested_contains_object + return sanitized, contains_object + + if isinstance(value, list): + if len(value) > limits["max_list_items"]: + raise ValueError("JSON list contains too many items") + sanitized_list: List[Any] = [] + contains_object = False + for index, nested in enumerate(value): + nested_sanitized, nested_contains_object = _sanitize_import_value( + nested, + f"{path}[{index}]", + limits, + allow_objects, + allowed_object_classes, + remapped_classes, + remap_table, + blocked_keys, + object_stack, + ) + sanitized_list.append(nested_sanitized) + contains_object = contains_object or nested_contains_object + return sanitized_list, contains_object + + if isinstance(value, str): + if len(value) > limits["max_string_length"]: + raise ValueError("JSON contains a string value that is too long") + return value, False + + if isinstance(value, bool) or value is None: + return value, False + + if isinstance(value, (int, float)): + if isinstance(value, float) and not math.isfinite(value): + raise ValueError("JSON contains a non-finite numeric value") + if abs(value) > limits["max_number_abs"]: + raise ValueError("JSON contains a numeric value outside allowed range") + return value, False + + raise ValueError("JSON contains an unsupported value type") + + +def _get_target_interview_variables(filename: str) -> Set[str]: + """Extract variable names known to a specific interview from its parsed AST.""" + try: + from docassemble.base.interview_cache import get_interview + + interview = get_interview(filename) + except Exception: + return set() + + names_used = set() + if hasattr(interview, 
"names_used"): + names_used.update(interview.names_used) + if hasattr(interview, "questions_list"): + for q in interview.questions_list: + if hasattr(q, "mako_names"): + names_used.update(q.mako_names) + if hasattr(q, "names_used"): + names_used.update(q.names_used) + if hasattr(q, "fields_used"): + names_used.update(q.fields_used) + if hasattr(interview, "questions"): + for val in interview.questions: + names_used.add(val) + + return names_used + + +def _sanitize_json_import_payload( + payload: Any, + variables_to_filter: Optional[List[str]] = None, + target_filename: Optional[str] = None, +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + if not isinstance(payload, dict): + raise ValueError("JSON root must be an object") + + blocked = set(al_sessions_variables_to_remove) + if variables_to_filter: + blocked = blocked.union(set(variables_to_filter)) + + allowed = _allowed_import_variables() + if target_filename: + target_vars = _get_target_interview_variables(target_filename) + if target_vars: + if allowed is None: + allowed = target_vars + else: + allowed = allowed.intersection(target_vars) + + limits = _import_limits() + allow_objects = _allow_object_imports() + allowed_object_classes = _allowed_object_classes() + al_config = get_config("assembly line", {}) + remap_enabled = al_config.get("answer set import remap known classes", True) + remap_table = _class_remap_table() if remap_enabled else {} + accepted: Dict[str, Any] = {} + rejected: List[Dict[str, str]] = [] + warnings: List[str] = [] + has_object_payload = False + remapped_classes: List[Dict[str, str]] = [] + + if not al_config.get("answer set import require signed", False): + warnings.append( + "Unsigned imports are enabled. All values are validated against structural limits and object allowlists." 
+ ) + + for key, value in payload.items(): + if not isinstance(key, str): + rejected.append({"path": str(key), "reason": "non-string key"}) + continue + if key in blocked: + rejected.append({"path": key, "reason": "protected variable"}) + continue + if not _safe_variable_name(key): + rejected.append({"path": key, "reason": "unsafe variable name"}) + continue + if allowed is not None and key not in allowed: + rejected.append({"path": key, "reason": "not in allowlist"}) + continue + try: + sanitized_value, nested_object = _sanitize_import_value( + value, + key, + limits, + allow_objects, + allowed_object_classes, + remapped_classes, + remap_table, + blocked, + ) + accepted[key] = sanitized_value + has_object_payload = has_object_payload or nested_object + except Exception as err: + rejected.append({"path": key, "reason": str(err)}) + continue + + if has_object_payload: + warnings.append( + "Object payload detected. Import will reconstruct only allowlisted docassemble object classes." + ) + elif not allow_objects: + warnings.append( + "Object imports are disabled; only plain JSON values are accepted." + ) + + report = { + "accepted": sorted(list(accepted.keys())), + "rejected": rejected, + "warnings": warnings, + "limits": limits, + "contains_objects": has_object_payload, + "remapped_classes": remapped_classes, + } + return accepted, report + + def _package_name(package_name: Optional[str] = None): """Get package name without the name of the current module, like: docassemble.ALWeaver instead of docassemble.ALWeaver.advertise_capabilities @@ -1470,7 +2033,7 @@ def load_interview_json( new_session: bool = False, new_interview_filename: Optional[str] = None, variables_to_filter: Optional[List[str]] = None, -) -> Optional[int]: +) -> Optional[Union[int, bool]]: """ Given a JSON string, this function loads the specified variables into a Docassemble session. JSON strings containing annotated class names will be transformed into Docassemble objects. 
@@ -1485,21 +2048,66 @@ def load_interview_json( Returns: Optional[Union[int, bool]]: ID of the newly created session if `new_session` is True, otherwise True or False based on success. """ - json_processed = json.loads(json_string) + limits = _import_limits() + target_filename = new_interview_filename or current_context().filename + try: + parsed = _parse_json_with_limits(json_string, limits) + json_processed, report = _sanitize_json_import_payload( + parsed, + variables_to_filter=variables_to_filter, + target_filename=target_filename, + ) + report["limits"] = limits + except Exception as err: + _set_last_import_report( + { + "accepted": [], + "rejected": [ + { + "path": "$", + "reason": str(err), + } + ], + "warnings": [], + "limits": limits, + } + ) + return False + + if not json_processed: + report["warnings"].append("No variables were imported from the provided JSON.") + _set_last_import_report(report) + return False if new_session: if not new_interview_filename: new_interview_filename = current_context().filename new_session_id = create_session(new_interview_filename) set_session_variables( - new_interview_filename, new_session_id, json_processed, process_objects=True + new_interview_filename, + new_session_id, + json_processed, + process_objects=bool(report.get("contains_objects", False)), ) + _set_last_import_report(report) return new_session_id else: try: - set_variables(json_processed, process_objects=True) + set_variables( + json_processed, + process_objects=bool(report.get("contains_objects", False)), + ) + _set_last_import_report(report) return True - except: + except Exception: + log("Answer set import failed while setting variables") + report["rejected"].append( + { + "path": "$", + "reason": "failed to apply imported variables to session", + } + ) + _set_last_import_report(report) return False @@ -1551,9 +2159,9 @@ def is_valid_json(json_string: str) -> bool: bool: True if the string is a valid JSON, otherwise it raises a validation error and 
returns False. """ try: - json.loads(json_string) - except: - validation_error("Enter a valid JSON-formatted string") + _parse_json_with_limits(json_string, _import_limits()) + except Exception as err: + validation_error(str(err)) return False return True diff --git a/docassemble/AssemblyLine/test_sessions.py b/docassemble/AssemblyLine/test_sessions.py new file mode 100644 index 00000000..5afd61d5 --- /dev/null +++ b/docassemble/AssemblyLine/test_sessions.py @@ -0,0 +1,446 @@ +# do not pre-load + +import json +import os +import tempfile +import unittest +from unittest.mock import patch + + +def _ensure_da_test_config() -> None: + """Docassemble modules require a config file path during import.""" + if os.environ.get("DA_CONFIG_FILE"): + return + config_path = os.path.join(tempfile.gettempdir(), "assemblyline-test-config.yml") + if not os.path.exists(config_path): + with open(config_path, "w", encoding="utf-8") as f: + f.write("secretkey: testing-secret\n") + f.write("url root: http://localhost\n") + os.environ["DA_CONFIG_FILE"] = config_path + + +_ensure_da_test_config() + +from . 
import sessions + + +class TestAnswerSetImportSafety(unittest.TestCase): + def setUp(self): + current_dir = os.path.dirname(os.path.abspath(__file__)) + self.samples_dir = os.path.join( + current_dir, "data", "sources", "answer_set_import_samples" + ) + + def _sample(self, filename: str) -> str: + with open(os.path.join(self.samples_dir, filename), "r", encoding="utf-8") as f: + return f.read() + + def test_parse_rejects_malformed_json_file(self): + bad_json = self._sample("malformed_trailing_comma.json") + with self.assertRaises(ValueError): + sessions._parse_json_with_limits(bad_json, sessions._import_limits()) + + def test_parse_rejects_dunder_object_key(self): + bad_json = self._sample("malicious_dunder_key.json") + with self.assertRaises(ValueError) as cm: + sessions._parse_json_with_limits(bad_json, sessions._import_limits()) + self.assertIn("forbidden key", str(cm.exception)) + + def test_sanitize_filters_internal_key(self): + bad_json = self._sample("malicious_internal_key.json") + payload = sessions._parse_json_with_limits(bad_json, sessions._import_limits()) + accepted, report = sessions._sanitize_json_import_payload(payload) + self.assertEqual(accepted, {"users_name": "Alex", "city": "Boston"}) + self.assertTrue(any(item["path"] == "_internal" for item in report["rejected"])) + + def test_parse_rejects_when_list_item_limit_exceeded(self): + limits = sessions._import_limits() + limits["max_list_items"] = 3 + payload = json.dumps({"names": ["a", "b", "c", "d"]}) + with self.assertRaises(ValueError) as cm: + sessions._parse_json_with_limits(payload, limits) + self.assertIn("too many items", str(cm.exception)) + + @patch("docassemble.AssemblyLine.sessions.get_config") + def test_sanitize_respects_allowlist(self, mock_get_config): + def mock_get_config_func(section, default=None): + if section == "assembly line": + return { + "answer set import allowed variables": ["users_name"], + "answer set import require signed": False, + } + return default + + 
mock_get_config.side_effect = mock_get_config_func + payload = { + "users_name": "Alex", + "users_city": "Boston", + "user_started_case": True, + } + + accepted, report = sessions._sanitize_json_import_payload(payload) + + self.assertEqual(accepted, {"users_name": "Alex"}) + self.assertTrue( + any(item["reason"] == "not in allowlist" for item in report["rejected"]) + ) + self.assertTrue( + any(item["reason"] == "protected variable" for item in report["rejected"]) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_load_interview_json_partial_import_with_report(self, mock_set_variables): + payload = json.dumps( + { + "users_name": "Alex", + "user_started_case": True, + } + ) + + result = sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + + self.assertTrue(result) + mock_set_variables.assert_called_once_with( + {"users_name": "Alex"}, process_objects=False + ) + self.assertIn("users_name", report["accepted"]) + self.assertTrue( + any(item["path"] == "user_started_case" for item in report["rejected"]) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_load_interview_json_allows_object_graph_and_references( + self, mock_set_variables + ): + object_json = self._sample("object_graph_with_reference.json") + + result = sessions.load_interview_json(object_json) + report = sessions.get_last_import_report() + + self.assertTrue(result) + args, kwargs = mock_set_variables.call_args + self.assertIn("users", args[0]) + self.assertTrue(kwargs.get("process_objects")) + users_obj = args[0]["users"] + self.assertEqual( + users_obj["_class"], "docassemble.AssemblyLine.al_general.ALPeopleList" + ) + self.assertEqual(users_obj["elements"][0]["agent"]["instanceName"], "spouse") + self.assertEqual(users_obj["elements"][0]["custom_text"], "notes") + self.assertEqual(users_obj["elements"][0]["custom_float"], 1.25) + self.assertIn("users", report["accepted"]) + self.assertTrue(report.get("contains_objects")) 
+ + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_load_interview_json_rejects_unknown_object_class(self, mock_set_variables): + object_json = self._sample("object_graph_unknown_class.json") + + result = sessions.load_interview_json(object_json) + report = sessions.get_last_import_report() + + self.assertFalse(result) + mock_set_variables.assert_not_called() + self.assertTrue(any(item["path"] == "users" for item in report["rejected"])) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_load_interview_json_remaps_known_playground_classes( + self, mock_set_variables + ): + object_json = self._sample("object_graph_playground_alias.json") + + result = sessions.load_interview_json(object_json) + report = sessions.get_last_import_report() + + self.assertTrue(result) + args, kwargs = mock_set_variables.call_args + self.assertTrue(kwargs.get("process_objects")) + users_obj = args[0]["users"] + self.assertEqual( + users_obj["_class"], "docassemble.AssemblyLine.al_general.ALPeopleList" + ) + self.assertEqual( + users_obj["object_type"]["name"], + "docassemble.AssemblyLine.al_general.ALIndividual", + ) + self.assertEqual( + users_obj["elements"][0]["_class"], + "docassemble.AssemblyLine.al_general.ALIndividual", + ) + self.assertEqual( + users_obj["elements"][0]["name"]["_class"], + "docassemble.base.util.IndividualName", + ) + self.assertTrue(len(report.get("remapped_classes", [])) >= 1) + + def test_load_interview_json_returns_false_for_invalid_json(self): + bad_json = self._sample("malformed_trailing_comma.json") + + result = sessions.load_interview_json(bad_json) + report = sessions.get_last_import_report() + + self.assertFalse(result) + self.assertTrue(any(item["path"] == "$" for item in report["rejected"])) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_rejects_dunder_in_instance_name(self, mock_set_variables): + payload = json.dumps( + { + "evil": { + "_class": "docassemble.base.util.DAObject", + 
"instanceName": "evil.__class__",
+                }
+            }
+        )
+
+        result = sessions.load_interview_json(payload)
+        report = sessions.get_last_import_report()
+
+        self.assertFalse(result)
+        mock_set_variables.assert_not_called()
+        self.assertTrue(
+            any("instanceName" in item.get("reason", "") for item in report["rejected"])
+        )
+
+    @patch("docassemble.AssemblyLine.sessions.set_variables")
+    def test_adversarial_type_envelope_bypass(self, mock_set_variables):
+        # Tries to bypass type envelope allowlisting by specifying an unauthorized class
+        payload = json.dumps(
+            {
+                "evil_type": {
+                    "_class": "type",
+                    "name": "subprocess.Popen",
+                }
+            }
+        )
+        sessions.load_interview_json(payload)
+        report = sessions.get_last_import_report()
+        self.assertTrue(
+            any(
+                item["path"] == "evil_type" and "not allowed" in item.get("reason", "")
+                for item in report["rejected"]
+            )
+        )
+
+    @patch("docassemble.AssemblyLine.sessions.set_variables")
+    def test_adversarial_object_attr_dunder(self, mock_set_variables):
+        # Tries to overwrite __globals__ via object attribute
+        payload = json.dumps(
+            {
+                "users": {
+                    "_class": "docassemble.base.util.DAObject",
+                    "instanceName": "users",
+                    "__globals__": {"evil": "code"},
+                }
+            }
+        )
+        sessions.load_interview_json(payload)
+        report = sessions.get_last_import_report()
+        # the key check fails at structural parsing phase before getting to sanitizer object validation,
+        # which means the entire document is rejected with path '$'
+        self.assertTrue(
+            any(
+                item["path"] == "$"
+                and "forbidden key '__globals__'" in item.get("reason", "")
+                for item in report["rejected"]
+            )
+        )
+
+    @patch("docassemble.AssemblyLine.sessions.set_variables")
+    def test_adversarial_deep_nesting(self, mock_set_variables):
+        # Depth limit bypass attempt
+        payload = {"bomb": "a"}
+        for i in range(45):
+            payload = {"bomb": payload}
+
+        with self.assertRaises(ValueError) as cm:
+            sessions._parse_json_with_limits(
+                json.dumps(payload), sessions._import_limits()
+            )
+        self.assertIn("too 
deep", str(cm.exception)) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_massive_string(self, mock_set_variables): + # Tries to eat memory with huge string + payload = {"big_string": "A" * 300000} + with self.assertRaises(ValueError) as cm: + sessions._parse_json_with_limits( + json.dumps(payload), sessions._import_limits() + ) + self.assertIn("too long", str(cm.exception)) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_number_overflow(self, mock_set_variables): + # Tries to DOS via large float parsing/math + payload = {"big_num": 1e20} + with self.assertRaises(ValueError) as cm: + sessions._parse_json_with_limits( + json.dumps(payload), sessions._import_limits() + ) + self.assertIn("outside allowed range", str(cm.exception)) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_newline_in_var_name(self, mock_set_variables): + # Exploits ^/$ regex bugs with newlines + payload = json.dumps( + { + "users\nname": "evil", + } + ) + sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + self.assertTrue( + any( + item["path"] == "users\nname" and "unsafe" in item.get("reason", "") + for item in report["rejected"] + ) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_protected_import_vars(self, mock_set_variables): + # Tries to overwrite crucial DA modules + payload = json.dumps( + {"server": "hacked", "daconfig": "hacked", "pickle": "hacked"} + ) + sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + for var in ["server", "daconfig", "pickle"]: # handled in _safe_variable_name + self.assertTrue( + any( + item["path"] == var + and "unsafe variable name" in item.get("reason", "") + for item in report["rejected"] + ) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_unicode_normalization_bypass(self, mock_set_variables): + # Tries 
to bypass variable allowlist with homoglyphs + # 'OS' is FULLWIDTH LATIN CAPITAL LETTER O and S + payload = json.dumps( + { + "OS": "evil", + } + ) + sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + self.assertTrue( + any( + item["path"] == "OS" and "unsafe" in item.get("reason", "") + for item in report["rejected"] + ) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_type_envelope_malformed(self, mock_set_variables): + # Tests that a malformed _class=type envelope is completely rejected + payload = json.dumps( + { + "valid_var": { + "_class": "type", + "name": "docassemble.base.util.DAObject", + "evil_extra_key": "injected", + } + } + ) + sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + self.assertTrue( + any( + item["path"] == "valid_var" and "only include" in item.get("reason", "") + for item in report["rejected"] + ) + ) + + @patch("docassemble.AssemblyLine.sessions.set_variables") + def test_adversarial_object_envelope_malformed(self, mock_set_variables): + # Tests that a docassemble object envelope without an instanceName is rejected + payload = json.dumps( + { + "users": { + "_class": "docassemble.base.util.DAObject", + } + } + ) + sessions.load_interview_json(payload) + report = sessions.get_last_import_report() + self.assertTrue( + any( + item["path"] == "users" + and "metadata must include both" in item.get("reason", "") + for item in report["rejected"] + ) + ) + + @patch("docassemble.AssemblyLine.sessions.validation_error") + def test_is_valid_json_reports_validation_error(self, mock_validation_error): + bad_json = self._sample("malformed_trailing_comma.json") + + is_valid = sessions.is_valid_json(bad_json) + + self.assertFalse(is_valid) + mock_validation_error.assert_called_once() + + @patch("docassemble.base.interview_cache.get_interview") + @patch("docassemble.AssemblyLine.sessions.current_context") + 
@patch("docassemble.AssemblyLine.sessions.set_variables") + @patch("docassemble.AssemblyLine.sessions.get_config") + def test_target_interview_vars( + self, mock_get_config, mock_set_vars, mock_current_context, mock_get_interview + ): + def mock_get_config_func(section, default=None): + return default + + mock_get_config.side_effect = mock_get_config_func + + class QStub: + pass + + class MockInterview: + def __init__(self): + self.names_used = {"user_name"} + q1 = QStub() + q1.mako_names = {"mako_var"} + q1.names_used = {"name_var"} + q1.fields_used = {"field_var"} + self.questions_list = [q1] + self.questions = {"q_val"} + + class MockCurrentContext: + def __init__(self): + self.filename = "docassemble.MyTest:test.yml" + self.session = "123" + + mock_get_interview.return_value = MockInterview() + mock_current_context.return_value = MockCurrentContext() + + payload = json.dumps( + { + "user_name": "Alice", + "mako_var": "val1", + "q_val": "val2", + "bad_var": "hacked", + } + ) + + sessions.load_interview_json(payload) + + report = sessions.get_last_import_report() + accepted_keys = report["accepted"] + rejected_keys = [r["path"] for r in report["rejected"]] + + self.assertIn("user_name", accepted_keys) + self.assertIn("mako_var", accepted_keys) + self.assertIn("q_val", accepted_keys) + self.assertIn("bad_var", rejected_keys) + self.assertTrue( + any( + "not in allowlist" in r["reason"] + for r in report["rejected"] + if r["path"] == "bad_var" + ) + ) + + +if __name__ == "__main__": + unittest.main()