# Source code for virtual_ecosystem.core.config_builder

"""The :mod:`~virtual_ecosystem.core.config_builder` module provides tools to load a
set of TOML formatted configuration dictionaries, either from files or from strings.
String inputs are primarily intended for use in configuring models for testing, where
it is more convenient to simply provide a string.

The main class :class:`ConfigurationLoader` handles the loading of configuration data
and compiling multiple sources into a single dictionary of configuration data.

The :func:`generate_configuration` function then:

* takes a compiled dictionary of configuration settings,
* assembles a pydantic validation model class using the configuration validators for
  each of the requested science modules, and
* passes the data through the validator to return a validated configuration model for
  the simulation.

Canonical usage patterns for the module would be:

.. code-block:: python

    config_data = ConfigurationLoader(...)
    config_object = generate_configuration(config_data.data)

"""  # noqa: D205

import tomllib
from collections.abc import Sequence
from copy import deepcopy
from pathlib import Path
from typing import Any

from pydantic import ValidationError, create_model

from virtual_ecosystem.core.configuration import CompiledConfiguration
from virtual_ecosystem.core.exceptions import ConfigurationError
from virtual_ecosystem.core.logger import LOGGER
from virtual_ecosystem.core.registry import (
    DISTURBANCE_REGISTRY,
    MODULE_REGISTRY,
    register_disturbance,
    register_module,
)


def merge_configuration_dicts(
    dest: dict, source: dict, **kwargs
) -> tuple[dict, set[str]]:
    """Recursively merge two configuration dictionaries.

    This function returns a copy of the input ``dest`` dictionary that has been extended
    recursively with the entries from the input ``source`` dictionary.

    The merging process looks for duplicated settings. In general, if two input
    dictionaries share complete key paths (that is a set of nested dictionary keys
    leading to a value) then that indicates a duplicated setting. The values might be
    identical, but the configuration files should not duplicate settings. When
    duplicated key paths are found, the value from the source dictionary is used and the
    function extends the returned ``conflicts`` set with the duplicated key path.

    However an exception is where both entries are lists - for example, resulting from a
    TOML array of tables (https://toml.io/en/v1.0.0#array-of-tables). In this case, it
    is reasonable to append the source values to the destination values. The motivating
    example here are `[[core.data.variable]]` entries, which can quite reasonably be
    split across configuration sources. Note that no attempt is made to check that the
    combined values are congruent - this is deferred to error handling when the
    configuration data is loaded.

    Args:
        dest: A dictionary to extend
        source: A dictionary of key value pairs to extend ``dest``
        **kwargs: Additional arguments used in recursion: ``conflicts`` (the set of
            conflicting key paths found so far) and ``path`` (the dotted key path of
            the dictionaries currently being merged).

    Returns:
        A copy of dest, extended recursively with values from source, and a set of
        duplicated key paths.
    """

    # Copy inputs to avoid mangling inputs
    dest = deepcopy(dest)
    source = deepcopy(source)

    # Populate conflicts and path from defaults or kwargs. These are not provided as
    # explicit arguments, because they would never really be used outside of recursion
    # and so are not really part of the API.
    conflicts: set[str] = kwargs.get("conflicts", set())
    path: str | None = kwargs.get("path", None)

    # Sentinel used to tell "key absent from dest" apart from "key present with the
    # value None": a key that exists in both dictionaries is a duplicate whatever
    # value it holds.
    missing = object()

    # Loop over the elements in the source dictionary
    for src_key, src_val in source.items():
        key_path = src_key if path is None else f"{path}.{src_key}"
        dest_val = dest.get(src_key, missing)

        if isinstance(dest_val, dict) and isinstance(src_val, dict):
            # Both values for this key are dictionaries, so recurse, extending the path
            dest[src_key], conflicts = merge_configuration_dicts(
                dest_val, src_val, conflicts=conflicts, path=key_path
            )
        elif isinstance(dest_val, list) and isinstance(src_val, list):
            # Both values for this key are lists, so merge the lists
            dest[src_key] = [*dest_val, *src_val]
        elif dest_val is missing:
            # The key is not currently in dest, so add the key value pair
            dest[src_key] = src_val
        else:
            # The key is in _both_, so override dest_val with src_val to keep
            # processing, but extend the conflicts set with the path to the
            # conflicting key.
            dest[src_key] = src_val
            conflicts.add(key_path)
            # NOTE: Could extend here to check for dest_val == src_val and then ignore
            # duplicate matching definitions, but cleaner to just forbid overlap.

    return dest, conflicts
def compile_configuration_data(data: list[dict]) -> tuple[dict, set[str]]:
    """Compile a combined configuration from multiple configuration dictionaries.

    This method sequentially merges configuration dictionaries, such as those loaded
    from multiple individual configuration files, into a single configuration
    dictionary. It returns the merged dictionary and a set of keys that have duplicated
    definitions in the input files.

    Args:
        data: A list of configuration dictionaries, merged in list order.

    Returns:
        The merged configuration dictionary and the set of key paths that were defined
        in more than one of the input dictionaries. An empty input list gives an empty
        dictionary and a single entry list returns that entry unchanged.
    """

    # Handle empty lists
    if not data:
        LOGGER.warning("No config files set")
        return {}, set()

    # Merge any further dictionaries into the first. The conflicts reported by each
    # merge are accumulated: rebinding a single variable on each iteration would only
    # retain the conflicts found in the final merge.
    compiled = data[0]
    conflicts: set[str] = set()
    for src in data[1:]:
        compiled, new_conflicts = merge_configuration_dicts(compiled, src)
        conflicts |= new_conflicts

    return compiled, conflicts
[docs] def _resolve_config_paths(config_dir: Path, config_dict: dict[str, Any]) -> None: """Resolve paths in a configuration file. Configuration files may contain keys providing file paths for data and other settings: these paths may be absolute but also could be relative to the specific configuration file. This becomes a problem when configurations are compiled across multiple configuration files, possibly in different locations, so this function searches the configuration dictionary loaded from a single file and updates configured relative paths to their absolute paths. At present, the configuration schema does not have an explicit mechanism to type a configuration option as being a path, so we currently use the `_path` suffix to indicate configuration options setting a path. So, this function recursively search a configuration file payload for values stored under keys ending in `_path` and resolves the paths. It does not attempt to resolve paths when the value starts with ``$`` as these are taken to be marker values for used in path substitution. Args: config_dir: A folder containing a configuration file. config_dict: A dictionary of contents of the configuration file, which may contain file paths to resolve. Raises: ValueError: if a key ending in ``_path`` has a non-string value. 
""" if not config_dir.is_absolute(): config_dir = config_dir.absolute() for key, item in config_dict.items(): if isinstance(item, dict): _resolve_config_paths(config_dir=config_dir, config_dict=item) elif isinstance(item, list): for list_entry in item: if isinstance(list_entry, dict): _resolve_config_paths(config_dir=config_dir, config_dict=list_entry) elif key.endswith("_path"): if not isinstance(item, str): raise ValueError( f"The value for config key '{key}' is not a string: {item}" ) # Do not resolve file markers if not item.startswith("$"): # Otherwise update the entry if the path is relative file_path = Path(item) if not file_path.is_absolute(): # The resolve method is used here because it is the only method to # resolve ../ entries from relative file paths and then the path is # made explicitly absolute file_resolved = (config_dir / file_path).resolve().absolute() config_dict[key] = str(file_resolved)
[docs] class ConfigurationLoader: """Configuration loading. The ``ConfigurationLoader`` class is used to load and compile configuration data for a Virtual Ecosystem simulation. Configuration data can be passed in as one of: * a list of paths to individual TOML configuration files or directories of TOML files (the ``cfg_paths`` argument) or * a list of TOML strings providing configuration data (the ``cfg_strings`` argument). In both cases, there is initial input validation of the argument values and then two data handling steps are run. Data loading ~~~~~~~~~~~~ The :meth:`_load_data` method handles the parsing of the TOML inputs. For configuration data passed as strings, this is largely checking that the data is valid TOML. For configuration data passed as paths, the following steps occur: * The :meth:`_collect_config_paths` method is used to compile a complete list of the individual TOML files to be used to build the configuration from the provided paths. * The :meth:`_load_config_toml` method is then used to parse the TOML content of each file, verifying that is valid TOML, and then store the parsed contents. * The :meth:`_resolve_config_file_paths` method is then used to to update file paths in configuration inputs to resolve them to absolute file paths. This is so that the file paths in the final compiled configuration data are all mutually resolvable, as the input files may use relative paths and do not necessarily all live in the same directory. At the end of this step, the :attr:`toml_contents` attribute will have been populated with individual parsed dictionaries of configuration data from each file or input string. Data compilation ~~~~~~~~~~~~~~~~ The :meth:`_compile_data` method is then run to compile the different individual dictionaries into a single configuration document. This method checks that configuration settings are uniquely set across the various configuration data sources. 
The :attr:`data` attribute then contains the complete compiled set of configuration data from the provided sources. Args: cfg_paths: A string, Path or list of strings or Paths giving configuration file or directory paths. cfg_strings: A string or list of strings containing TOML formatted configuration data. cli_config: Configuration settings provided by the user at the command line, used to override configuration settings in files. autoload: A boolean flag that can be used to turn off automatic data loading and compilation. """ def __init__( self, cfg_paths: str | Path | Sequence[str | Path] = [], cfg_strings: str | list[str] = [], cli_config: dict[str, Any] | None = None, autoload: bool = True, ) -> None: # Define attributes self.cfg_paths: list[Path] = [] """The configuration file paths, normalised from the cfg_paths argument.""" self.toml_files: list[str | Path] = [] """A list of TOML file paths resolved from the initial config paths.""" self.cfg_strings: list[str] = [] """A list of strings containing TOML content, provided by the ``cfg_strings`` argument.""" self.toml_contents: dict[str | Path, dict] = {} """A dictionary of the parsed TOML contents of config files or strings, keyed by file path or string index.""" self.merge_conflicts: list = [] """A list of configuration keys duplicated across configuration files.""" self.config_errors: list[tuple[str, Any]] = [] """Configuration errors, as a list of tuples of key path and error details.""" self.from_cfg_strings: bool = False """A boolean flag indicating whether paths or strings were used to create the instance.""" self.model_classes: dict[str, Any] = {} # FIXME: -> dict[str, Type[BaseModel]] """A dictionary of the model classes specified in the configuration, keyed by model name.""" self.cli_config: dict[str, Any] | None = cli_config """An optional dictionary of configuration settings passed at the command line that can be used to override configuration data loaded from file.""" self.data: dict[str, Any] """A 
dictionary of the compiled configuration data from the provided data sources.""" # Prohibit using neither paths and string or both paths and strings. Note that # these trap empty lists, so you have to provide _something_. if not (cfg_paths or cfg_strings): to_raise = ValueError("Provide cfg_paths or cfg_strings.") LOGGER.critical(to_raise) raise to_raise if cfg_paths and cfg_strings: to_raise = ValueError("Do not use both cfg_paths and cfg_strings.") LOGGER.critical(to_raise) raise to_raise # Standardise inputs and set from_cfg_strings if cfg_strings: # Standardise to a list of strings self.cfg_strings = ( [cfg_strings] if isinstance(cfg_strings, str) else cfg_strings ) self.from_cfg_strings = True if cfg_paths: # Standardise cfg_paths to list of Paths self.cfg_paths = ( [Path(cfg_paths)] if isinstance(cfg_paths, str | Path) else [Path(p) for p in cfg_paths] ) if autoload: self._load_data() self._compile_data()
[docs] def _load_data(self): """Load configuration data. This method loads configuration data from the sources set when the class instance was created. """ if self.from_cfg_strings: # Load the TOML content self._load_config_toml_string() else: # Load the TOML content from resolved paths and resolve file paths # within configuration files. self._collect_config_paths() self._load_config_toml() self._resolve_config_file_paths()
[docs] def _compile_data(self): """Compile configuration data. This method compiles loaded configuration data into a single data dictionary, warning of conflicting or repeated settings across the sources. """ data, conflicts = compile_configuration_data(list(self.toml_contents.values())) # Report on duplicated settings, sorting the conflicts to give stable ordering # in log reports and errors. if conflicts: to_raise = ConfigurationError( f"Duplicated entries in config files: {', '.join(sorted(conflicts))}", ) LOGGER.critical(to_raise) raise to_raise # Enforce any configuration overrides passed in at the command line. Conflicts # are allowed here - although this mechanism can also be used to set # configuration options _not_ in the other sources - so do nothing about # conflicting settings if self.cli_config is not None: data, _ = merge_configuration_dicts(data, self.cli_config) self.data = data LOGGER.info("Configuration data compiled.")
[docs] def _collect_config_paths(self) -> None: """Collect TOML config files from provided paths. The :class:`ConfigurationLoader` class is initialised with a list of paths to either individual TOML config files or directories containing possibly multiple config files. This method examines that list to collect all the individual TOML config files in the provided locations and then populates the :attr:`toml_files` attribute. Raises: ConfigurationError: this is raised if any of the paths: do not exist, are directories that do not contain TOML files, are not TOML files or if the resolved files contain duplicate entries. """ all_valid = True # Validate the paths for path in self.cfg_paths: if not path.exists(): all_valid = False LOGGER.error(f"Config file path does not exist: {path}") elif path.is_dir(): toml_in_dir = list(path.glob("*.toml")) if toml_in_dir: self.toml_files.extend(toml_in_dir) else: all_valid = False LOGGER.error( f"Config directory path contains no TOML files: {path}" ) elif path.is_file() and path.suffix != ".toml": all_valid = False LOGGER.error(f"Config file path with non-TOML suffix: {path}") else: self.toml_files.append(path) # Check that no files are resolved twice dupl_files = { str(md) for md in self.toml_files if self.toml_files.count(md) > 1 } if dupl_files: all_valid = False LOGGER.error(f"Repeated files in config paths: {','.join(dupl_files)}") # Raise if there are any path errors if not all_valid: to_raise = ConfigurationError("Config paths not all valid: check log.") LOGGER.critical(to_raise) raise to_raise LOGGER.info(f"Config paths resolve to {len(self.toml_files)} files")
[docs] def _load_config_toml(self) -> None: """Load the contents of resolved configuration files. This method populates the :attr:`toml_contents` dictionary with the contents of the configuration files set in :attr:`toml_files`. Raises: ConfigurationError: Invalid TOML content in config files. """ failed_inputs = False # Load the contents into the instance for this_file in self.toml_files: try: with open(this_file, "rb") as file_io: self.toml_contents[this_file] = tomllib.load(file_io) except tomllib.TOMLDecodeError as err: failed_inputs = True LOGGER.error(f"Config TOML parsing error in {this_file}: {err!s}") else: LOGGER.info(f"Config TOML loaded from {this_file}") if failed_inputs: to_raise = ConfigurationError("Errors parsing config files: check log") LOGGER.critical(to_raise) raise to_raise
[docs] def _load_config_toml_string(self) -> None: """Load the contents of a config provided as a string. This method populates the :attr:`toml_contents` dictionary with the parsed contents of a provided TOML formatted string. Raises: ConfigurationError: Invalid TOML string. """ for index, cfg_string in enumerate(self.cfg_strings): # Load the contents into the instance try: self.toml_contents[f"cfg_string_{index}"] = tomllib.loads(cfg_string) except tomllib.TOMLDecodeError as err: to_raise = ConfigurationError( f"TOML parsing error in cfg_strings: {err!s}" ) LOGGER.critical(to_raise) raise to_raise LOGGER.info("Config TOML loaded from config strings")
[docs] def _resolve_config_file_paths(self) -> None: """Resolve the locations of configured file paths. Configuration files can contain paths to other resources, such as the paths to files containing input data variables. These paths can be absolute, but may also be relative to the location of the configuration file itself. This method is used to resolve the location of files to the common root of the provided set of configuration files, typically the path where a simulation is started. """ # Safeguard against running this when the toml_contents is from a cfg_string if self.from_cfg_strings: # TODO - how to resolve relative paths in cfg_string - niche use case LOGGER.warning("Config file paths not resolved with cfg_string") return for config_file, contents in self.toml_contents.items(): if isinstance(config_file, Path): try: _resolve_config_paths( config_dir=config_file.parent, config_dict=contents ) except ValueError as excep: LOGGER.critical(excep) raise excep
def build_configuration_model(
    requested_modules: list[str], requested_disturbances: list[str]
) -> type[CompiledConfiguration]:
    """Build a configuration model for a simulation.

    This function takes the names of the science modules and disturbances requested for
    a simulation. It registers each of them, which populates the module and
    disturbance registries, and then looks up the configuration model of each
    registered component. The ``core`` module is always included, whether or not it is
    explicitly requested.

    The configuration models are combined dynamically into a single pydantic model with
    one field per requested module. When disturbances are requested, their
    configuration models are first combined into a model of their own, which is then
    added to the main model as the ``disturbance`` field. The returned class can then
    be used to validate the data provided in the configuration files.

    The returned model class also provides the class variable ``_model_classes``,
    holding a dictionary of the registered model for each requested module other than
    ``core``, keyed by module name.

    Args:
        requested_modules: The names of the science modules to configure.
        requested_disturbances: The names of the disturbances to configure.

    Returns:
        A dynamically created subclass of ``CompiledConfiguration``.
    """

    # The core module is mandatory
    if "core" not in requested_modules:
        requested_modules = ["core", *requested_modules]

    # Registration handles unknown names and must happen before the registries are
    # queried below, since it is what populates them.
    for module_name in requested_modules:
        if module_name == "core":
            register_module("virtual_ecosystem.core")
        else:
            register_module(f"virtual_ecosystem.models.{module_name}")

    for disturbance_name in requested_disturbances:
        register_disturbance(f"virtual_ecosystem.disturbances.{disturbance_name}")

    # Pair each requested name with its registered configuration model
    submodels = [(name, MODULE_REGISTRY[name].config) for name in requested_modules]
    subdisturbance_models = [
        (name, DISTURBANCE_REGISTRY[name].config) for name in requested_disturbances
    ]

    # Pydantic's create_model is used to generate a model dynamically, with a field for
    # each requested component defaulting to a default instance of its configuration.
    # Mypy does not like this, but it seems to be used as intended:
    # https://docs.pydantic.dev/latest/concepts/models/#dynamic-model-creation

    # The disturbances, if there are any, are wrapped up first into a single model
    if subdisturbance_models:
        disturbance_fields = {
            fname: (cname, cname()) for fname, cname in subdisturbance_models
        }
        combined_disturbance_model = create_model(
            "CompiledConfiguration",
            __base__=CompiledConfiguration,
            **disturbance_fields,
        )  # type: ignore[call-overload]

        # Record the registered disturbance models by requested disturbance name
        combined_disturbance_model._model_classes = {
            name: DISTURBANCE_REGISTRY[name].model for name in requested_disturbances
        }

        submodels.append(("disturbance", combined_disturbance_model))

    # Then the main model, which includes the disturbance model where one was built
    module_fields = {fname: (cname, cname()) for fname, cname in submodels}
    combined_model = create_model(
        "CompiledConfiguration",
        __base__=CompiledConfiguration,
        **module_fields,
    )  # type: ignore[call-overload]

    # Record the registered science models by requested module name, leaving out core
    combined_model._model_classes = {
        name: MODULE_REGISTRY[name].model
        for name in requested_modules
        if name != "core"
    }

    return combined_model
def generate_configuration(
    data: dict[str, Any] = {}, context: Any | None = None
) -> CompiledConfiguration:
    """Generate a configuration model from configuration data.

    This method takes a dictionary of configuration data and tries to build a validated
    configuration model. The input data is typically loaded and compiled using the
    :class:`ConfigurationLoader` class.

    The first step is to take the root sections in the configuration data - indicating
    the various science models requested for a simulation - and uses those to build a
    composite configuration validator class. The keys of the ``disturbance`` section,
    if present, are used as the requested disturbances. The provided data is then
    passed into the validator. If validation is successful then a validated
    configuration object is returned, otherwise the specific validation errors are
    written to the log and the function raises a :class:`ConfigurationError`.

    The pydantic validation process allows validation context to be passed to a
    validator object and this context is shared with daughter validators. At the
    moment, this is only used to pass path substitutions to validation.

    Args:
        data: A dictionary of unvalidated configuration data. The default dictionary
            is only read and is never modified.
        context: Additional context to be passed to validation.

    Raises:
        ConfigurationError: if the ``disturbance`` section is not a dictionary or if
            the data fails validation.
        ModuleNotFoundError: re-raised, after logging, when building the configuration
            model fails with this error.
        RuntimeError: re-raised, after logging, when building the configuration model
            fails with this error.
    """

    # The root sections name the requested science modules, apart from the disturbance
    # section, which holds one subsection per requested disturbance.
    requested_modules = [key for key in data if key != "disturbance"]

    disturbance_data = data.get("disturbance", {})
    if not isinstance(disturbance_data, dict):
        # Report a malformed section as a configuration problem, rather than failing
        # with an AttributeError when its keys are requested.
        to_raise = ConfigurationError(
            "The 'disturbance' configuration section must be a table."
        )
        LOGGER.critical(to_raise)
        raise to_raise

    # Build the configuration model from the compiled configuration
    try:
        ConfigurationModel = build_configuration_model(
            requested_modules=requested_modules,
            requested_disturbances=list(disturbance_data.keys()),
        )
    except (ModuleNotFoundError, RuntimeError) as err:
        LOGGER.critical(str(err))
        raise

    LOGGER.info("Configuration model built.")

    try:
        # model_validate is a classmethod, so it is called on the class itself and no
        # throwaway default instance needs to be constructed first.
        configuration = ConfigurationModel.model_validate(data, context=context)
    except ValidationError as validation_errors:
        for error in validation_errors.errors():
            LOGGER.error(
                f"{'.'.join(str(x) for x in error['loc'])} = {error['input']}: "
                f"{error['msg']}"
            )
        LOGGER.critical("Configuration validation failed. See errors above.")
        raise ConfigurationError(
            "Validation errors in configuration data - check log."
        ) from validation_errors

    LOGGER.info("Configuration validated.")

    return configuration