# Source code for virtual_ecosystem.core.config_builder

"""The :mod:`~virtual_ecosystem.core.config_builder` provides tools to load a set of
TOML formatted configuration dictionaries, either from files or from strings. String
inputs are primarily intended for use in configuring models for testing, where it is
more convenient to simply provide a string.

The main class :class:`ConfigurationLoader` handles the loading of configuration data
and compiling multiple sources into a single dictionary of configuration data.

The :func:`generate_configuration` function then:

* takes a compiled dictionary of configuration settings,
* assembles a pydantic validation model class using the configuration validators for
  each of the requested science modules, and
* passes the data through the validator to return a validated configuration model for
  the simulation.

Canonical usage patterns for the module would be:

.. code-block:: python

    config_data = ConfigurationLoader(...)
    config_object = generate_configuration(config_data.data)

"""  # noqa: D205

import tomllib
from collections.abc import Sequence
from copy import deepcopy
from pathlib import Path
from typing import Any

from pydantic import ValidationError, create_model

from virtual_ecosystem.core.configuration import CompiledConfiguration
from virtual_ecosystem.core.exceptions import ConfigurationError
from virtual_ecosystem.core.logger import LOGGER
from virtual_ecosystem.core.registry import (
    DISTURBANCE_REGISTRY,
    MODULE_REGISTRY,
    register_disturbance,
    register_module,
)



[docs]
def merge_configuration_dicts(
    dest: dict, source: dict, **kwargs
) -> tuple[dict, set[str]]:
    """Recursively merge two configuration dictionaries.

    This function returns a copy of the input ``dest`` dictionary that has been extended
    recursively with the entries from the input ``source`` dictionary.

    The merging process looks for duplicated settings. In general, if two input
    dictionaries share complete key paths (that is a set of nested dictionary keys
    leading to a value) then that indicates a duplicated setting. The values might be
    identical, but the configuration files should not duplicate settings. When
    duplicated key paths are found, the value from the source dictionary is used and the
    function extends the returned ``conflicts`` set with the duplicated key path.

    However an exception is where both entries are lists - for example, resulting from a
    TOML array of tables (https://toml.io/en/v1.0.0#array-of-tables). In this case, it
    is reasonable to append the source values to the destination values. The motivating
    example here are `[[core.data.variable]]` entries, which can quite reasonably be
    split across configuration sources. Note that no attempt is made to check that the
    combined values are congruent - this is deferred to error handling when the
    configuration data is loaded.

    A key counts as duplicated whenever it is present in both dictionaries, even if
    the value held in ``dest`` is ``None``.

    Args:
        dest: A dictionary to extend
        source: A dictionary of key value pairs to extend ``dest``
        **kwargs: Additional arguments used in recursion: ``conflicts`` (the set of
            duplicated key paths found so far, which is updated in place) and ``path``
            (the dotted key path leading to the dictionaries being merged).

    Returns:
        A copy of dest, extended recursively with values from source, and a set of
        duplicate key paths.
    """

    # Sentinel used to tell "key absent from dest" apart from "key present with a None
    # value": only a truly absent key may be added without recording a conflict.
    missing = object()

    # Copy inputs to avoid mangling inputs
    dest = deepcopy(dest)
    source = deepcopy(source)

    # Populate conflicts and path from defaults or kwargs. These are not provided as
    # explicit arguments, because they would never really be used outside of recursion
    # and so are not really part of the API.
    conflicts: set = kwargs.get("conflicts", set())
    path: str | None = kwargs.get("path", None)

    # Loop over the elements in the source dictionary
    for src_key, src_val in source.items():
        dest_val = dest.get(src_key, missing)

        # Dotted path to this key, used both for recursion and conflict reporting
        key_path = src_key if path is None else f"{path}.{src_key}"

        if dest_val is missing:
            # The key is not currently in dest, so add the key value pair
            dest[src_key] = src_val
        elif isinstance(dest_val, dict) and isinstance(src_val, dict):
            # Both values for this key are dictionaries, so recurse, extending the path
            dest[src_key], conflicts = merge_configuration_dicts(
                dest_val, src_val, conflicts=conflicts, path=key_path
            )
        elif isinstance(dest_val, list) and isinstance(src_val, list):
            # Both values for this key are lists, so merge the lists
            dest[src_key] = [*dest_val, *src_val]
        else:
            # The key is in _both_, so override destval with srcval to keep processing,
            # but extend the conflicts set with the path to the conflicting key.
            dest[src_key] = src_val
            conflicts.add(key_path)

            # NOTE: Could extend here to check for dest_val == src_val and then ignore
            #       duplicate matching definitions, but cleaner to just forbid overlap.

    return dest, conflicts




[docs]
def compile_configuration_data(data: list[dict]) -> tuple[dict, set[str]]:
    """Compile a combined configuration from multiple configuration dictionaries.

    This method sequentially merges configuration dictionaries, such as those loaded
    from multiple individual configuration files, into a single configuration
    dictionary. It returns the merged dictionary and a set of keys that have duplicated
    definitions in the input files.

    Args:
        data: A list of configuration dictionaries, merged in list order.

    Returns:
        The merged configuration dictionary and the set of key paths that are defined
        in more than one of the inputs. A single input dictionary is returned as is,
        not as a copy.
    """

    # Handle empty lists
    if not data:
        LOGGER.warning("No config files set")
        return {}, set()

    # Just return the contents for a singleton list
    if len(data) == 1:
        return data[0], set()

    # Otherwise, merge other dicts into first
    compiled = data[0]
    conflicts: set[str] = set()

    for src in data[1:]:
        compiled, new_conflicts = merge_configuration_dicts(compiled, src)
        # Accumulate across merges: each merge reports only its own conflicts, so
        # rebinding here would silently drop duplicates found in earlier sources.
        conflicts.update(new_conflicts)

    return compiled, conflicts




[docs]
def _resolve_config_paths(config_dir: Path, config_dict: dict[str, Any]) -> None:
    """Resolve paths in a configuration file.

    Configuration files may contain keys providing file paths for data and other
    settings: these paths may be absolute but also could be relative to the specific
    configuration file. This becomes a problem when configurations are compiled across
    multiple configuration files, possibly in different locations, so this function
    searches the configuration dictionary loaded from a single file and updates
    configured relative paths to their absolute paths.

    At present, the configuration schema does not have an explicit mechanism to type a
    configuration option as being a path, so we currently use the `_path` suffix to
    indicate configuration options setting a path. So, this function recursively search
    a configuration file payload for values stored under keys ending in `_path` and
    resolves the paths.

    It does not attempt to resolve paths when the value starts with ``$`` as these are
    taken to be marker values for used in path substitution.

    Args:
        config_dir: A folder containing a configuration file.
        config_dict: A dictionary of contents of the configuration file, which may
            contain file paths to resolve.

    Raises:
        ValueError: if a key ending in ``_path`` has a non-string value.
    """

    if not config_dir.is_absolute():
        config_dir = config_dir.absolute()

    for key, item in config_dict.items():
        if isinstance(item, dict):
            _resolve_config_paths(config_dir=config_dir, config_dict=item)
        elif isinstance(item, list):
            for list_entry in item:
                if isinstance(list_entry, dict):
                    _resolve_config_paths(config_dir=config_dir, config_dict=list_entry)
        elif key.endswith("_path"):
            if not isinstance(item, str):
                raise ValueError(
                    f"The value for config key '{key}' is not a string: {item}"
                )
            # Do not resolve file markers
            if not item.startswith("$"):
                # Otherwise update the entry if the path is relative
                file_path = Path(item)
                if not file_path.is_absolute():
                    # The resolve method is used here because it is the only method to
                    # resolve ../ entries from relative file paths and then the path is
                    # made explicitly absolute
                    file_resolved = (config_dir / file_path).resolve().absolute()

                    config_dict[key] = str(file_resolved)




[docs]
class ConfigurationLoader:
    """Configuration loading.

    The ``ConfigurationLoader`` class is used to load and compile configuration data for
    a Virtual Ecosystem simulation. Configuration data can be passed in as one of:

    * a list of paths to individual TOML configuration files or directories of TOML
      files  (the ``cfg_paths`` argument) or
    * a list of TOML strings providing configuration data (the ``cfg_strings``
      argument).

    In both cases, there is initial input validation of the argument values and then two
    data handling steps are run.

    Data loading
    ~~~~~~~~~~~~

    The :meth:`_load_data` method handles the parsing of the TOML inputs. For
    configuration data passed as strings, this is largely checking that the data is
    valid TOML.

    For configuration data passed as paths, the following steps occur:

    * The :meth:`_collect_config_paths` method is used to compile a complete list of the
      individual TOML files to be used to build the configuration from the provided
      paths.

    * The :meth:`_load_config_toml` method is then
      used to parse the TOML content of each file, verifying that is valid TOML, and
      then store the parsed contents.

    * The :meth:`_resolve_config_file_paths` method is then used to to update file paths
      in configuration inputs to resolve them to absolute file paths. This is so that
      the file paths in the final compiled configuration data are all mutually
      resolvable, as the input files may use relative paths and do not necessarily all
      live in the same directory.

    At the end of this step, the :attr:`toml_contents` attribute will have been
    populated with individual parsed dictionaries of configuration data from each file
    or input string.

    Data compilation
    ~~~~~~~~~~~~~~~~

    The :meth:`_compile_data` method is then run to compile the different individual
    dictionaries into a single configuration document. This method checks that
    configuration settings are uniquely set across the various configuration data
    sources. The :attr:`data` attribute then contains the complete compiled set of
    configuration data from the provided sources.


    Args:
        cfg_paths: A string, Path or list of strings or Paths giving configuration
            file or directory paths.
        cfg_strings: A string or list of strings containing TOML formatted configuration
            data.
        cli_config: Configuration settings provided by the user at the command line,
            used to override configuration settings in files.
        autoload: A boolean flag that can be used to turn off automatic data loading and
            compilation.
    """

    def __init__(
        self,
        cfg_paths: str | Path | Sequence[str | Path] = [],
        cfg_strings: str | list[str] = [],
        cli_config: dict[str, Any] | None = None,
        autoload: bool = True,
    ) -> None:
        # Define attributes
        self.cfg_paths: list[Path] = []
        """The configuration file paths, normalised from the cfg_paths argument."""
        self.toml_files: list[str | Path] = []
        """A list of TOML file paths resolved from the initial config paths."""
        self.cfg_strings: list[str] = []
        """A list of strings containing TOML content, provided by the ``cfg_strings``
        argument."""
        self.toml_contents: dict[str | Path, dict] = {}
        """A dictionary of the parsed TOML contents of config files or strings, keyed by
        file path or string index."""
        self.merge_conflicts: list = []
        """A list of configuration keys duplicated across configuration files."""
        self.config_errors: list[tuple[str, Any]] = []
        """Configuration errors, as a list of tuples of key path and error details."""
        self.from_cfg_strings: bool = False
        """A boolean flag indicating whether paths or strings were used to create the
        instance."""
        self.model_classes: dict[str, Any] = {}  # FIXME: -> dict[str, Type[BaseModel]]
        """A dictionary of the model classes specified in the configuration, keyed by
        model name."""
        self.cli_config: dict[str, Any] | None = cli_config
        """An optional dictionary of configuration settings passed at the command line
        that can be used to override configuration data loaded from file."""
        self.data: dict[str, Any]
        """A dictionary of the compiled configuration data from the provided data
        sources."""

        # Prohibit using neither paths and string or both paths and strings. Note that
        # these trap empty lists, so you have to provide _something_.
        if not (cfg_paths or cfg_strings):
            to_raise = ValueError("Provide cfg_paths or cfg_strings.")
            LOGGER.critical(to_raise)
            raise to_raise

        if cfg_paths and cfg_strings:
            to_raise = ValueError("Do not use both cfg_paths and cfg_strings.")
            LOGGER.critical(to_raise)
            raise to_raise

        # Standardise inputs and set from_cfg_strings
        if cfg_strings:
            # Standardise to a list of strings
            self.cfg_strings = (
                [cfg_strings] if isinstance(cfg_strings, str) else cfg_strings
            )
            self.from_cfg_strings = True

        if cfg_paths:
            # Standardise cfg_paths to list of Paths
            self.cfg_paths = (
                [Path(cfg_paths)]
                if isinstance(cfg_paths, str | Path)
                else [Path(p) for p in cfg_paths]
            )

        if autoload:
            self._load_data()
            self._compile_data()


[docs]
    def _load_data(self):
        """Parse the configuration sources given when the instance was created.

        String sources only need to be parsed as TOML. File sources go through three
        steps: collecting the individual TOML files from the provided paths, parsing
        each file, and then resolving the file paths set within the configuration
        files.
        """
        if not self.from_cfg_strings:
            # Path based configuration: find the files, parse them and then resolve
            # the file paths that they contain.
            self._collect_config_paths()
            self._load_config_toml()
            self._resolve_config_file_paths()
            return

        # String based configuration: just parse the TOML content
        self._load_config_toml_string()



[docs]
    def _compile_data(self):
        """Merge the loaded configuration sources into the ``data`` attribute.

        The parsed contents of each source are compiled into a single dictionary. Any
        setting defined in more than one source is an error. Settings passed at the
        command line are then merged over the compiled data.

        Raises:
            ConfigurationError: if settings are duplicated across the sources.
        """

        sources = list(self.toml_contents.values())
        compiled, conflicts = compile_configuration_data(sources)

        # Duplicated settings are fatal. The conflicts are sorted so that log reports
        # and error messages have a stable ordering.
        if conflicts:
            to_raise = ConfigurationError(
                f"Duplicated entries in config files: {', '.join(sorted(conflicts))}",
            )
            LOGGER.critical(to_raise)
            raise to_raise

        # Command line settings are allowed to override values from the other sources,
        # and may also set options that the other sources do not provide, so the
        # conflicts reported by this merge are deliberately discarded.
        if self.cli_config is not None:
            compiled, _ = merge_configuration_dicts(compiled, self.cli_config)

        self.data = compiled

        LOGGER.info("Configuration data compiled.")



[docs]
    def _collect_config_paths(self) -> None:
        """Collect TOML config files from provided paths.

        The :class:`ConfigurationLoader` class is initialised with a list of paths to
        either individual TOML config files or directories containing possibly multiple
        config files. This method examines that list to collect all the individual TOML
        config files in the provided locations and then populates the :attr:`toml_files`
        attribute. Files found in a directory are added in sorted order, so that the
        order in which configuration files are later merged does not depend on the
        file system.

        Raises:
            ConfigurationError: this is raised if any of the paths: do not exist, are
                directories that do not contain TOML files, are not TOML files or if the
                resolved files contain duplicate entries.
        """

        all_valid = True

        # Validate the paths
        for path in self.cfg_paths:
            if not path.exists():
                all_valid = False
                LOGGER.error(f"Config file path does not exist: {path}")
            elif path.is_dir():
                # Directory listings come back in file system order, so sort them to
                # give a reproducible file order.
                toml_in_dir = sorted(path.glob("*.toml"))
                if toml_in_dir:
                    self.toml_files.extend(toml_in_dir)
                else:
                    all_valid = False
                    LOGGER.error(
                        f"Config directory path contains no TOML files: {path}"
                    )
            elif path.is_file() and path.suffix != ".toml":
                all_valid = False
                LOGGER.error(f"Config file path with non-TOML suffix: {path}")
            else:
                self.toml_files.append(path)

        # Check that no files are resolved twice, using a single pass over the files
        seen: set[str | Path] = set()
        dupl_files: set[str] = set()
        for toml_file in self.toml_files:
            if toml_file in seen:
                dupl_files.add(str(toml_file))
            seen.add(toml_file)

        if dupl_files:
            all_valid = False
            # Sort the repeated files to give stable ordering in log reports
            LOGGER.error(
                f"Repeated files in config paths: {','.join(sorted(dupl_files))}"
            )

        # Raise if there are any path errors
        if not all_valid:
            to_raise = ConfigurationError("Config paths not all valid: check log.")
            LOGGER.critical(to_raise)
            raise to_raise

        LOGGER.info(f"Config paths resolve to {len(self.toml_files)} files")



[docs]
    def _load_config_toml(self) -> None:
        """Parse every collected configuration file into :attr:`toml_contents`.

        Each file listed in :attr:`toml_files` is parsed as TOML and the parsed contents
        are stored in :attr:`toml_contents`, keyed by the file path. Parsing errors are
        logged for every failing file before the method raises, so that a single run
        reports all of the broken files.

        Raises:
            ConfigurationError: Invalid TOML content in config files.
        """

        parse_failures = 0

        for toml_path in self.toml_files:
            try:
                with open(toml_path, "rb") as handle:
                    parsed = tomllib.load(handle)
            except tomllib.TOMLDecodeError as err:
                # Record the failure and carry on, to report on all of the files
                parse_failures += 1
                LOGGER.error(f"Config TOML parsing error in {toml_path}: {err!s}")
                continue

            self.toml_contents[toml_path] = parsed
            LOGGER.info(f"Config TOML loaded from {toml_path}")

        if parse_failures:
            to_raise = ConfigurationError("Errors parsing config files: check log")
            LOGGER.critical(to_raise)
            raise to_raise



[docs]
    def _load_config_toml_string(self) -> None:
        """Parse the configuration strings into :attr:`toml_contents`.

        Each string in :attr:`cfg_strings` is parsed as TOML and stored in
        :attr:`toml_contents` under a ``cfg_string_<index>`` key, where the index is the
        position of the string in the list. Parsing stops at the first invalid string.

        Raises:
            ConfigurationError: Invalid TOML string.
        """

        for index, cfg_string in enumerate(self.cfg_strings):
            try:
                parsed = tomllib.loads(cfg_string)
            except tomllib.TOMLDecodeError as err:
                to_raise = ConfigurationError(
                    f"TOML parsing error in cfg_strings: {err!s}"
                )
                LOGGER.critical(to_raise)
                raise to_raise

            # Only successfully parsed content is stored on the instance
            self.toml_contents[f"cfg_string_{index}"] = parsed

        LOGGER.info("Config TOML loaded from config strings")



[docs]
    def _resolve_config_file_paths(self) -> None:
        """Make file paths set in the loaded configuration files absolute.

        Configuration files can refer to other resources, such as files of input data,
        using paths that may be absolute or may be relative to the configuration file
        itself. For each loaded configuration file, this method resolves the configured
        paths against the folder containing that file, updating the parsed contents in
        :attr:`toml_contents` in place.

        Nothing is resolved when the instance was created from configuration strings.

        Raises:
            ValueError: re-raised, after logging, when resolving the paths in a
                configuration file fails.
        """

        # Safeguard against running this when the toml_contents is from a cfg_string
        if self.from_cfg_strings:
            # TODO - how to resolve relative paths in cfg_string - niche use case
            LOGGER.warning("Config file paths not resolved with cfg_string")
            return

        for source, payload in self.toml_contents.items():
            # Only entries keyed by a file path have a folder to resolve against
            if not isinstance(source, Path):
                continue

            try:
                _resolve_config_paths(config_dir=source.parent, config_dict=payload)
            except ValueError as excep:
                LOGGER.critical(excep)
                raise excep





[docs]
def build_configuration_model(
    requested_modules: list[str], requested_disturbances: list[str]
) -> type[CompiledConfiguration]:
    """Assemble the combined configuration model class for a simulation.

    The requested modules and disturbances are first registered, which populates the
    module and disturbance registries with their configuration models and their model
    classes. The ``core`` module is always included, whether or not it is requested.

    The registered configuration models are then combined dynamically into a single
    pydantic model class, with one field per requested module. When disturbances are
    requested, their configuration models are first combined into a model of their own,
    which is then included in the final model as the ``disturbance`` field.

    Both of the created classes have their ``_model_classes`` class variable set to a
    dictionary of the registered model classes, keyed by the requested names. The
    ``core`` module is not included in that dictionary.

    Args:
        requested_modules: The names of the modules to be configured.
        requested_disturbances: The names of the disturbances to be configured.

    Returns:
        The combined configuration model class, which can be used to validate the data
        provided in the configuration files.
    """

    # The core module is mandatory, so add it at the front if it was not requested
    module_names = (
        requested_modules
        if "core" in requested_modules
        else ["core", *requested_modules]
    )

    # Registration handles unknown names and is needed to populate the registries
    # before they are queried below.
    for name in module_names:
        if name == "core":
            register_module("virtual_ecosystem.core")
        else:
            register_module(f"virtual_ecosystem.models.{name}")

    for name in requested_disturbances:
        register_disturbance(f"virtual_ecosystem.disturbances.{name}")

    # Pair up each requested name with its registered configuration model
    module_fields = [(name, MODULE_REGISTRY[name].config) for name in module_names]
    disturbance_fields = [
        (name, DISTURBANCE_REGISTRY[name].config) for name in requested_disturbances
    ]

    # The pydantic create_model function is used to dynamically generate models with
    # a field for each requested name, defaulting to an instance of the configuration
    # model built with no arguments. Mypy does not like this, but it seems to be used
    # as intended:
    # https://docs.pydantic.dev/latest/concepts/models/#dynamic-model-creation
    if disturbance_fields:
        # Disturbances are bundled into their own model, nested under 'disturbance'
        disturbance_model = create_model(
            "CompiledConfiguration",
            __base__=CompiledConfiguration,
            **{
                field_name: (config_class, config_class())
                for field_name, config_class in disturbance_fields
            },
        )  # type: ignore[call-overload]
        disturbance_model._model_classes = {
            name: DISTURBANCE_REGISTRY[name].model for name in requested_disturbances
        }
        module_fields.append(("disturbance", disturbance_model))

    combined_model = create_model(
        "CompiledConfiguration",
        __base__=CompiledConfiguration,
        **{
            field_name: (config_class, config_class())
            for field_name, config_class in module_fields
        },
    )  # type: ignore[call-overload]

    # The model classes are keyed by requested module name, leaving out the core
    combined_model._model_classes = {
        name: MODULE_REGISTRY[name].model for name in module_names if name != "core"
    }

    return combined_model




[docs]
def generate_configuration(
    data: dict[str, Any] = {}, context: Any | None = None
) -> CompiledConfiguration:
    """Generate a configuration model from configuration data.

    This method takes a dictionary of configuration data and tries to build a validated
    configuration model. The input data is typically loaded and compiled using the
    :class:`ConfigurationLoader` class.

    The first step is to take the root sections in the configuration data - indicating
    the various science models requested for a simulation - and uses those to build a
    composite configuration validator class. The keys within the ``disturbance`` root
    section, if present, give the requested disturbances.

    The provided data is then passed into the validator. If validation is successful
    then a validated configuration object is returned, otherwise the specific validation
    errors are written to the log and the function raises a :class:`ConfigurationError`

    The pydantic validation process allows validation context to be passed to a
    validator object and this context is shared with daughter validators. At the moment,
    this is only used to pass path substitutions to validation.

    Args:
        data: A dictionary of unvalidated configuration data.
        context: Additional context to be passed to validation.

    Returns:
        The validated configuration object.

    Raises:
        ModuleNotFoundError: re-raised, after logging, if building the configuration
            model fails with this error.
        RuntimeError: re-raised, after logging, if building the configuration model
            fails with this error.
        ConfigurationError: if the data fails validation.
    """
    # Every root section apart from 'disturbance' names a requested module
    requested_modules = [key for key in data if key != "disturbance"]

    # Build the configuration model from the compiled configuration
    try:
        ConfigurationModel = build_configuration_model(
            requested_modules=requested_modules,
            requested_disturbances=list(data.get("disturbance", {}).keys()),
        )
    except (ModuleNotFoundError, RuntimeError) as err:
        LOGGER.critical(str(err))
        raise

    LOGGER.info("Configuration model built.")

    try:
        # model_validate is a classmethod, so it is called on the class directly
        # rather than first building a throwaway instance from the default values.
        configuration = ConfigurationModel.model_validate(data, context=context)
    except ValidationError as validation_errors:
        for error in validation_errors.errors():
            LOGGER.error(
                f"{'.'.join(str(x) for x in error['loc'])} = {error['input']}: "
                f"{error['msg']}"
            )
        LOGGER.critical("Configuration validation failed. See errors above.")
        raise ConfigurationError(
            "Validation errors in configuration data - check log."
        ) from validation_errors

    LOGGER.info("Configuration validated.")

    return configuration