Source code for bootleg.utils.parser.parser_utils

"""
Bootleg parser utils.

Parses a Booleg input config into a DottedDict of config values (with
defaults filled in) for running a model.
"""

import argparse
import fileinput
import os

import ujson

import bootleg.utils.classes.comment_json as comment_json
from bootleg.utils.classes.dotted_dict import DottedDict, create_bool_dotted_dict
from bootleg.utils.parser.bootleg_args import config_args
from bootleg.utils.parser.emm_parse_args import (
    parse_args as emm_parse_args,
    parse_args_to_config as emm_parse_args_to_config,
)
from bootleg.utils.utils import load_yaml_file


[docs]def or_none(default):
    """Return or None function."""

    def func(x):
        # Convert "none" to proper None object
        if x.lower() == "none":
            return None
        # If default is None (and x is not None), return x without conversion as str
        elif default is None:
            return str(x)
        # Treat bools separately as bool("False") is true
        elif isinstance(default, bool):
            if x.lower() == "false":
                return False
            return True
        # Otherwise, default has non-None type; convert x to that type
        else:
            return type(default)(x)

    return func


[docs]def is_number(s):
    """Return True is string is a number."""
    try:
        float(s)
        return True
    except ValueError:
        return False


[docs]def is_json(value):
    """Return True if json."""
    # ujson is weird in that a string of a number is a dictionary; we don't want this
    if is_number(value):
        return False
    try:
        ujson.loads(value)
    except ValueError:
        return False
    return True


[docs]def recursive_keys(dictionary):
    """Recursively yields all keys of dict."""
    for key, value in dictionary.items():
        if type(value) is dict:
            yield key
            yield from recursive_keys(value)
        else:
            yield key


[docs]def merge_configs(config_l, config_r, new_config=None):
    """Merge two dotted dict configs."""
    if new_config is None:
        new_config = {}
    for k in config_l:
        # If unique to config_l or the same in both configs, add
        if k not in config_r or config_l[k] == config_r[k]:
            new_config[k] = config_l[k]
        # If not unique and different, then they must be dictionaries (that we can recursively merge)
        else:
            assert type(config_l[k]) in [dict, DottedDict] and type(config_r[k]) in [
                dict,
                DottedDict,
            ], f"You have two conflicting values for key {k}: {config_l[k]} vs {config_r[k]}"
            new_config[k] = merge_configs(config_l[k], config_r[k])
    for k in config_r:
        # If unique to config_r or the same in both configs, add
        if k not in config_l or config_l[k] == config_r[k]:
            new_config[k] = config_r[k]
    return new_config


[docs]def add_nested_flags_from_config(parser, config_dict, parser_hierarchy, prefix):
    """
    Add flags from config file, keeping the hierarchy the same.

    When a lower level is needed, parser.add_argument_group is called.
    Note, we append the parent key to the --param option (via prefix parameter).

    Args:
        parser: arg parser to add options to
        config_dict: raw config dictionary
        parser_hierarchy: Dict to add parser hierarhcy to
        prefix: prefix to add to arg parser
    """
    for param in config_dict:
        if isinstance(config_dict[param], dict):
            parser_hierarchy[param] = {}
            temp = parser.add_argument_group(f"Bootleg specific {param.split('_')[0]}")
            add_nested_flags_from_config(
                temp, config_dict[param], parser_hierarchy[param], f"{prefix}{param}."
            )
        else:
            default, description = config_dict[param]
            try:
                if isinstance(default, str) and is_json(default):
                    parser.add_argument(
                        f"--{prefix}{param}",
                        type=ujson.loads,
                        default=default,
                        help=description,
                    )
                elif isinstance(default, list):
                    if len(default) > 0:
                        # pass a list as argument
                        parser.add_argument(
                            f"--{prefix}{param}",
                            action="append",
                            type=type(default[0]),
                            default=default,
                            help=description,
                        )
                    else:
                        parser.add_argument(
                            f"--{prefix}{param}",
                            action="append",
                            default=default,
                            help=description,
                        )
                    parser_hierarchy["_global"] = parser
                else:
                    # pass
                    parser.add_argument(
                        f"--{prefix}{param}",
                        type=or_none(default),
                        default=default,
                        help=description,
                    )
                    parser_hierarchy["_global"] = parser
            except argparse.ArgumentError:
                print(
                    f"Could not add flag for param {param} because it was already present."
                )
    return


[docs]def flatten_nested_args_for_parser(args, new_args, groups, prefix):
    """Flatten all parameters to be passed as a single list to arg parse."""
    for key in args:
        if isinstance(args[key], dict):
            if key in groups:
                new_args = flatten_nested_args_for_parser(
                    args[key], new_args, groups, f"{prefix}{key}."
                )
            else:
                new_args.append(f"--{prefix}{key}")
                # print("HERE2", vars(args))
                if isinstance(args, dict):
                    new_args.append(f"{ujson.dumps(args[key])}")
                else:
                    new_args.append(f"{ujson.dumps(vars(args)[key])}")
        elif isinstance(args[key], list):
            for v in args[key]:
                new_args.append(f"--{prefix}{key}")
                if isinstance(v, dict):
                    new_args.append(f"{ujson.dumps(v)}")
                else:
                    new_args.append(f"{v}")
        else:
            new_args.append(f"--{prefix}{key}")
            new_args.append(f"{args[key]}")
    return new_args


[docs]def reconstructed_nested_args(args, names, parser_hierarchy, prefix):
    """Reconstruct the arguments and pass them to the necessary subparsers."""
    for key, sub_parser in parser_hierarchy.items():
        if isinstance(sub_parser, dict):
            names[key] = {}
            reconstructed_nested_args(args, names[key], sub_parser, f"{prefix}{key}.")
        else:
            sub_options = [action.dest for action in sub_parser._group_actions]
            sub_names = {
                name: value
                for (name, value) in args._get_kwargs()
                if name in sub_options
            }
            temp = argparse.Namespace(**sub_names)
            # remove the prefix from the key
            for k, v in temp.__dict__.items():
                names[k.replace(f"{prefix}", "")] = v
    return


[docs]def load_commented_json_file(file):
    """Load commented json file."""
    json_out = ""
    for line in fileinput.input(file):  # Read it all in
        json_out += line
    almost_json = comment_json.remove_comments(json_out)  # Remove comments
    proper_json = comment_json.remove_trailing_commas(
        almost_json
    )  # Remove trailing commas
    validated = ujson.loads(proper_json)  # We now have parseable JSON!
    return validated


[docs]def get_boot_config(config, parser_hierarchy=None, parser=None, unknown=None):
    """
    Return a parsed Bootleg config from config.

    Config can be a path to a config file or an already loaded dictionary.

    The high level work flow
       1. Reads Bootleg default config (config_args) and addes params to a arg parser,
          flattening all hierarchical values into "." values
               E.g., data_config -> word_embeddings -> layers becomes --data_config.word_embedding.layers
       2. Flattens the given config values into the "." format
       3. Adds any unknown values from the first arg parser that parses the config script.
          Allows the user to add --data_config.word_embedding.layers to command line that overwrite values in file
       4. Parses the flattened args w.r.t the arg parser
       5. Reconstruct the args back into their hierarchical form

    Args:
        config: model specific config
        parser_hierarchy: Dict of hierarchy of config (or None)
        parser: arg parser (or None)
        unknown: unknown arg values passed from command line to be added to config and overwrite values in file
    """
    if unknown is None:
        unknown = []
    if parser_hierarchy is None:
        parser_hierarchy = {}
    if parser is None:
        parser = argparse.ArgumentParser()

    add_nested_flags_from_config(parser, config_args, parser_hierarchy, prefix="")
    if type(config) is str:
        assert os.path.splitext(config)[1] in [
            ".json",
            ".yaml",
        ], "We only accept json or yaml ending for configs"
        if os.path.splitext(config)[1] == ".json":
            params = load_commented_json_file(config)
        else:
            params = load_yaml_file(config)
    else:
        assert (
            type(config) is dict
        ), "We only support loading configs that are paths to json/yaml files or preloaded configs."
        params = config
    all_keys = list(recursive_keys(parser_hierarchy))
    new_params = flatten_nested_args_for_parser(params, [], groups=all_keys, prefix="")
    # update with new args
    # unknown must have ["--arg1", "value1", "--arg2", "value2"] as we don't have any action_true args
    assert len(unknown) % 2 == 0
    assert all(
        unknown[idx].startswith(("-", "--")) for idx in range(0, len(unknown), 2)
    )
    for idx in range(1, len(unknown), 2):
        # allow passing -1 for emmental.device argument
        assert not unknown[idx].startswith(("-", "--")) or (
            unknown[idx - 1] == "--emmental.device" and unknown[idx] == "-1"
        )
    for idx in range(0, len(unknown), 2):
        arg = unknown[idx]
        # If override one you already have in json
        if arg in new_params:
            idx2 = new_params.index(arg)
            new_params[idx2 : idx2 + 2] = unknown[idx : idx + 2]
        # If override one that is in bootleg_args.py by not in json
        else:
            new_params.extend(unknown[idx : idx + 2])
    args = parser.parse_args(new_params)
    top_names = {}
    reconstructed_nested_args(args, top_names, parser_hierarchy, prefix="")
    # final_args = argparse.Namespace(**top_names)
    final_args = create_bool_dotted_dict(top_names)
    # turn_to_dotdicts(final_args)
    return final_args


[docs]def parse_boot_and_emm_args(config_script, unknown=None):
    """
    Merge the Emmental config with the Bootleg config.

    As we have an emmental: ... level in our config for emmental commands,
    we need to parse those with the Emmental parser and then merge the Bootleg only config values
    with the Emmental ones.

    Args:
        config_script: config script for Bootleg and Emmental args
        unknown: unknown arg values passed from command line to overwrite file values

    Returns: parsed merged Bootleg and Emmental config
    """
    if unknown is None:
        unknown = []
    config_parser = argparse.ArgumentParser(
        description="Bootleg Config",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Modified parse_args to have 'emmental.group' prefixes. This represents a hierarchy in our parser
    config_parser, parser_hierarchy = emm_parse_args(parser=config_parser)
    # Add Bootleg args and parse
    all_args = get_boot_config(config_script, parser_hierarchy, config_parser, unknown)
    # These have emmental -> config group -> arg structure for emmental.
    # Must remove that hierarchy to converte to internal Emmental hierarchy
    emm_args = {}
    for k, v in all_args["emmental"].items():
        emm_args[k] = v
    del all_args["emmental"]
    # create and add Emmental hierarchy
    config = emm_parse_args_to_config(create_bool_dotted_dict(emm_args))
    # Merge configs back (merge workds on dicts so must convert to dict first)
    config = create_bool_dotted_dict(merge_configs(all_args, config))
    return config