# Source code for ijazz.config

import argparse
import sys
import yaml
import numpy as np
from pathlib import Path
from copy import deepcopy




# [docs]
def merge_dicts(dict1, dict2):
    """Recursively merge ``dict2`` into ``dict1`` in place.

    Rules applied to each ``key``/``value`` pair of ``dict2``:

    * If both ``value`` and ``dict1[key]`` are dictionaries, they are merged
      recursively.
    * ``'use_year_selection'`` is a control flag only: it is never copied
      into ``dict1``.
    * ``'cut'`` is combined with the cut already present in ``dict1`` using
      ``' and '`` unless ``dict2['use_year_selection']`` is falsy (it
      defaults to ``True``), in which case it replaces the existing cut.
      Empty or missing cuts are ignored when combining, so no dangling
      ``' and '`` is ever produced.
    * Any other value overwrites ``dict1[key]``.

    Args:
        dict1 (dict): Destination dictionary, modified in place.
        dict2 (dict): Dictionary whose content is merged into ``dict1``.

    Returns:
        None
    """
    for key, value in dict2.items():
        if isinstance(value, dict) and isinstance(dict1.get(key), dict):
            merge_dicts(dict1[key], value)
        elif key == 'use_year_selection':
            # Only steers how 'cut' is merged; not part of the merged output.
            continue
        elif key == 'cut' and dict2.get('use_year_selection', True):
            # Join only the non-empty pieces so that a missing/empty cut on
            # either side does not leave a leading or trailing ' and '.
            pieces = [piece for piece in (dict1.get(key), value) if piece]
            dict1[key] = ' and '.join(pieces)
        else:
            dict1[key] = value



# [docs]
def ijazz_config_sas():
    """Entry point for the IJazZ Scale and Smearing fit configuration script.

    Parses the command line, loads the main YAML configuration (positional
    ``config`` argument) and the YAML file describing the steps (``--cfg``),
    then calls ``config_sas`` to generate one YAML file per step.

    ``--cfg`` is mandatory: when it is omitted the parser prints a usage
    message and exits with status 2.
    """
    parser = argparse.ArgumentParser(description='IJazZ Scale and Smearing fit')
    parser.add_argument('config', type=str, help='yaml config file')
    # The steps file is opened unconditionally below, so let argparse reject
    # a missing --cfg instead of failing later on open(None).
    parser.add_argument('--cfg', type=str, required=True,
                        help='path to the yaml config with steps')
    args = parser.parse_args(sys.argv[1:])

    with open(args.config, 'r') as fcfg:
        config = yaml.safe_load(fcfg)

    with open(args.cfg, 'r') as fcfg:
        cfg = yaml.safe_load(fcfg)

    config_sas(config, cfg)



# [docs]
def config_sas(config: dict, cfg: dict) -> None:
    """Write one YAML configuration file per SAS step (and per dataset when split).

    For every entry of ``cfg['steps']`` the step is merged on top of a copy of
    ``cfg`` (see ``merge_dicts``), completed with the data/MC file lists, the
    results directory and the dataset / correction-set names, and dumped to
    ``<dir_yaml>/sas_<step name>[_<subyear>].yaml``.

    From the second step on, when the previous step has ``sas.correct_data``
    enabled (the default), every ``.parquet`` in the data file names is
    replaced by ``.<object_type><previous step name>Corr.parquet``. The
    renaming is cumulative from one step to the next.

    Neither ``config`` nor ``cfg`` is modified: the function works on copies,
    so calling it twice with the same inputs produces the same files.

    Args:
        config (dict): A dictionary containing the main configuration.
            Expected keys include:
            - 'datasets': List of dataset dictionaries with 'file_dt' and
              'file_mc' keys (a single path or a list of paths) and an
              optional 'subyear'.
            - 'dir_yaml': Directory path to save the generated YAML files
              (defaults to the current directory; created if missing).
            - 'sas': Optional dictionary containing SAS-specific
              configurations (e.g., 'cut').
            - 'object_type': A string representing the object type.
            - 'year': The year, used as prefix of the dataset name.
            - 'dir_results': Directory path for storing results.
        cfg (dict): A dictionary containing the SAS steps configuration.
            Expected keys include:
            - 'steps': List of step dictionaries, each containing:
                - 'name': Name of the step.
                - 'split': Boolean indicating whether to write one file per
                  dataset instead of one file for all datasets together.
                - 'sas': Dictionary with SAS-specific step configurations
                    (e.g., 'correct_data').

    Returns:
        None: The function writes YAML files to the specified directory.

    Example:
        config = {
            'datasets': [{'file_dt': ['data1.parquet'], 'file_mc': ['mc1.parquet']}],
            'dir_yaml': './yaml_configs',
            'sas': {'cut': 'some_cut'},
            'object_type': 'Pho',
            'year': '2023',
            'dir_results': './results'
        }
        cfg = {
            'steps': [{'name': 'Step1', 'split': False, 'sas': {'correct_data': True}}]
        }
        config_sas(config, cfg)
    """
    # Private copies: file names are rewritten step after step and the global
    # cut is injected into cfg, none of which should leak back to the caller.
    datasets_sas = deepcopy(config['datasets'])
    cfg = deepcopy(cfg)

    dir_yaml = Path(config.get('dir_yaml', '.'))
    dir_yaml.mkdir(parents=True, exist_ok=True)

    # The 'sas' section is optional in both dictionaries.
    sas = config.get('sas') or {}
    if cut := sas.get('cut', None):
        print(f'Applying cut: {cut}')
        cfg_sas = cfg.get('sas') or {}
        cfg_sas['cut'] = cut
        cfg['sas'] = cfg_sas

    steps = cfg['steps']
    previous_corr_name = ''

    for i, step in enumerate(steps):
        print(f'Processing step: {step["name"]}')

        # Data files of this step are the corrected output of the previous
        # step, unless that step did not correct the data.
        previous_corrected = bool(
            i and (steps[i - 1].get('sas') or {}).get('correct_data', True)
        )
        for dataset in datasets_sas:
            # Normalise first: iterating a bare string would yield characters.
            files = _as_file_list(dataset['file_dt'])
            if previous_corrected:
                files = [
                    file.replace('.parquet', f'.{previous_corr_name}.parquet')
                    for file in files
                ]
            dataset['file_dt'] = files

        if step.get('split', False):
            print('Used split datasets')
            datasets = datasets_sas
        else:
            # Single pseudo-dataset gathering the files of every dataset.
            files_dt = []
            files_mc = []
            for dataset in datasets_sas:
                files_dt += _as_file_list(dataset['file_dt'])
                files_mc += _as_file_list(dataset['file_mc'])
            datasets = [{"subyear": '', 'file_dt': files_dt, 'file_mc': files_mc}]

        cset_name = config['object_type'] + step['name']
        dir_results = Path(config['dir_results']) / step['name']

        for dataset in datasets:
            subyear = dataset.get('subyear') or ''

            config_step = deepcopy(cfg)
            merge_dicts(config_step, step)
            config_step['file_dt'] = _as_file_list(dataset['file_dt'])
            config_step['file_mc'] = _as_file_list(dataset['file_mc'])
            config_step['dir_results'] = str(dir_results)
            # f-string so that a year parsed by YAML as an integer still works.
            config_step['dset_name'] = f'{config["year"]}{subyear}'
            config_step['cset_name'] = cset_name
            config_step.pop('steps', None)

            name_yaml = f'sas_{step["name"]}{"_" if subyear else ""}{subyear}.yaml'
            print(f'Writing yaml file: {dir_yaml / name_yaml}')
            with open(dir_yaml / name_yaml, 'w') as yaml_file:
                yaml.dump(config_step, yaml_file, default_flow_style=False, sort_keys=False)

        # Suffix that the next step has to look for in the data file names.
        previous_corr_name = config['object_type'] + step['name'] + 'Corr'


def _as_file_list(files):
    """Return ``files`` as a new list, wrapping a single scalar path in a list."""
    return [files] if np.isscalar(files) else list(files)