Keep model input data unchanged in SDK (#6455)

Nested containers passed as model input may be modified while the data is parsed in class constructors. This can lead to subtle memory errors, which are very hard to find. In CVAT, keeping the input data unchanged helps to avoid unexpected problems in tests, such as one test affecting another by subtly changing shared test assets.

Keep model input data unchanged in SDK (#6455)
Nested containers passed as model input may be modified while the data is parsed in class constructors. This can lead to subtle memory errors, which are very hard to find. In CVAT, keeping the input data unchanged helps to avoid unexpected problems in tests, such as one test affecting another by subtly changing shared test assets.
31b5e25c · Maxim Zhiltsov · GitHub · 03975ea4 · 31b5e25c · 31b5e25c
3 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 - \[SDK\] Ability to create attributes with blank default values
  (<https://github.com/opencv/cvat/pull/6454>)
+- \[SDK\] SDK should not change input data in models (<https://github.com/opencv/cvat/pull/6455>)

 ### Security
 - TDB

--- a/cvat-sdk/gen/templates/openapi-generator/model_utils.mustache
+++ b/cvat-sdk/gen/templates/openapi-generator/model_utils.mustache
 {{>partial_header}}

 from datetime import date, datetime  # noqa: F401
-from copy import deepcopy
+from copy import copy, deepcopy
 import inspect
 import io
 import os
@@ -1310,14 +1310,18 @@ def validate_and_convert_types(input_value, required_types_mixed, path_to_item,
    if inner_required_types is None:
        # for this type, there are not more inner variables left to look at
        return input_value
+
    if isinstance(input_value, list):
-        if input_value == []:
+        # avoid storing and changing the input value when the type is a mutable collection
+        output_value = copy(input_value)
+
+        if output_value == []:
            # allow an empty list
-            return input_value
-        for index, inner_value in enumerate(input_value):
+            return output_value
+        for index, inner_value in enumerate(output_value):
            inner_path = list(path_to_item)
            inner_path.append(index)
-            input_value[index] = validate_and_convert_types(
+            output_value[index] = validate_and_convert_types(
                inner_value,
                inner_required_types,
                inner_path,
@@ -1326,16 +1330,19 @@ def validate_and_convert_types(input_value, required_types_mixed, path_to_item,
                configuration=configuration
            )
    elif isinstance(input_value, dict):
-        if input_value == {}:
+        # avoid storing and changing the input value when the type is a mutable collection
+        output_value = copy(input_value)
+
+        if output_value == {}:
            # allow an empty dict
-            return input_value
-        for inner_key, inner_val in input_value.items():
+            return output_value
+        for inner_key, inner_val in output_value.items():
            inner_path = list(path_to_item)
            inner_path.append(inner_key)
            if get_simple_class(inner_key) != str:
                raise get_type_error(inner_key, inner_path, valid_classes,
                                     key_type=True)
-            input_value[inner_key] = validate_and_convert_types(
+            output_value[inner_key] = validate_and_convert_types(
                inner_val,
                inner_required_types,
                inner_path,
@@ -1343,7 +1350,10 @@ def validate_and_convert_types(input_value, required_types_mixed, path_to_item,
                _check_type,
                configuration=configuration
            )
-    return input_value
+    else:
+        output_value = input_value
+
+    return output_value


 def model_to_dict(model_instance, serialize=True):
@@ -1382,24 +1392,20 @@ def model_to_dict(model_instance, serialize=True):
                except KeyError:
                    used_fallback_python_attribute_names.add(attr)
            if isinstance(value, list):
-                if not value:
-                    # empty list or None
-                    result[attr] = value
-                else:
-                    res = []
-                    for v in value:
-                        if isinstance(v, PRIMITIVE_TYPES) or v is None:
-                            res.append(v)
-                        elif isinstance(v, ModelSimple):
-                            res.append(v.value)
-                        elif isinstance(v, dict):
-                            res.append(dict(map(
-                                extract_item,
-                                v.items()
-                            )))
-                        else:
-                            res.append(model_to_dict(v, serialize=serialize))
-                    result[attr] = res
+                res = []
+                for v in value:
+                    if isinstance(v, PRIMITIVE_TYPES) or v is None:
+                        res.append(v)
+                    elif isinstance(v, ModelSimple):
+                        res.append(v.value)
+                    elif isinstance(v, dict):
+                        res.append(dict(map(
+                            extract_item,
+                            v.items()
+                        )))
+                    else:
+                        res.append(model_to_dict(v, serialize=serialize))
+                result[attr] = res
            elif isinstance(value, dict):
                result[attr] = dict(map(
                    extract_item,

--- a/tests/python/sdk/test_api_wrappers.py
+++ b/tests/python/sdk/test_api_wrappers.py
+# Copyright (C) 2023 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from copy import deepcopy
+
+from cvat_sdk import models
+from deepdiff import DeepDiff
+
+
+def test_models_do_not_change_input_values():
+    # Nested containers may be modified during the model input data parsing.
+    # This can lead to subtle memory errors, which are very hard to find.
+    original_input_data = {
+        "name": "test",
+        "labels": [
+            {
+                "name": "cat",
+                "attributes": [
+                    {
+                        "default_value": "yy",
+                        "input_type": "text",
+                        "mutable": False,
+                        "name": "x",
+                        "values": ["yy"],
+                    },
+                    {
+                        "default_value": "1",
+                        "input_type": "radio",
+                        "mutable": False,
+                        "name": "y",
+                        "values": ["1", "2"],
+                    },
+                ],
+            }
+        ],
+    }
+
+    input_data = deepcopy(original_input_data)
+
+    models.TaskWriteRequest(**input_data)
+
+    assert DeepDiff(original_input_data, input_data) == {}
+
+
+def test_models_do_not_store_input_collections():
+    # Avoid depending on input data for collection fields after the model is initialized.
+    # This can lead to subtle memory errors and unexpected behavior
+    # if the original input data is modified.
+    input_data = {
+        "name": "test",
+        "labels": [
+            {
+                "name": "cat1",
+                "attributes": [
+                    {
+                        "default_value": "yy",
+                        "input_type": "text",
+                        "mutable": False,
+                        "name": "x",
+                        "values": ["yy"],
+                    },
+                    {
+                        "default_value": "1",
+                        "input_type": "radio",
+                        "mutable": False,
+                        "name": "y",
+                        "values": ["1", "2"],
+                    },
+                ],
+            },
+            {"name": "cat2", "attributes": []},
+        ],
+    }
+
+    model = models.TaskWriteRequest(**input_data)
+    model_data1 = model.to_dict()
+
+    # Modify input value containers
+    input_data["labels"][0]["attributes"].clear()
+    input_data["labels"][1]["attributes"].append(
+        {
+            "default_value": "",
+            "input_type": "text",
+            "mutable": True,
+            "name": "z",
+        }
+    )
+    input_data["labels"].append({"name": "dog"})
+
+    model_data2 = model.to_dict()
+
+    assert DeepDiff(model_data1, model_data2) == {}
+
+
+def test_models_do_not_return_internal_collections():
+    # Avoid returning internal data for mutable collection fields.
+    # This can lead to subtle memory errors and unexpected behavior
+    # if the returned data is modified.
+    input_data = {
+        "name": "test",
+        "labels": [],
+    }
+
+    model = models.TaskWriteRequest(**input_data)
+    model_data1 = model.to_dict()
+    model_data1_original = deepcopy(model_data1)
+
+    # Modify an output value container
+    model_data1["labels"].append({"name": "dog"})
+
+    model_data2 = model.to_dict()
+
+    assert DeepDiff(model_data1_original, model_data2) == {}