Source code for ijazz.categorize

import numpy as np
import pandas as pd

def categorize(df: pd.DataFrame, category_dict: dict, cut: str = "", prefix: str = 'cat',
               var_prefixes=None, var_suffixes=('1', '2')) -> np.ndarray:
    """Assign a category number to each lepton of every row of ``df``.

    For every variable ``name`` in ``category_dict`` and every lepton, the
    per-lepton column ``{var_prefix}{name}{var_suffix}`` is digitized with the
    given bin edges. The resulting per-variable bin indices are then combined
    into one flat category number per lepton.

    The dataframe is modified in place. The following columns are added (or
    overwritten):

    * ``{prefix}_{var_prefix}{name}{var_suffix}``: int32 bin index of the
      variable for that lepton, ``-1`` for rows rejected by ``cut``.
    * ``{prefix}1``, ``{prefix}2``: int32 flat category number of each lepton,
      ``-1`` when any variable of any lepton falls outside its bin range or
      the row is rejected by ``cut``.

    Args:
        df (pd.DataFrame): input dataframe containing the variables to
            categorize; new columns are written to it.
        category_dict (dict): bin edges per variable, e.g.
            ``{'pt': [25, 50, 100], 'abs_eta': [0, 1, 2]}``.
        cut (str, optional): expression passed to ``df.eval`` to select the
            rows to categorize. Defaults to "" (all rows).
        prefix (str, optional): prefix of the created columns.
            Defaults to 'cat'.
        var_prefixes (sequence of str, optional): per-lepton prefixes of the
            input variable names. When given, ``var_suffixes`` is ignored and
            empty suffixes are used instead. Only the first two entries are
            used. Defaults to None (no prefix).
        var_suffixes (sequence of str, optional): per-lepton suffixes of the
            input variable names, used only when ``var_prefixes`` is None.
            Only the first two entries are used. Defaults to ('1', '2').

    Returns:
        np.ndarray: array of shape ``(n_bins_var1, n_bins_var2, ...)`` holding
        the category number associated with each combination of bins.

    Raises:
        ValueError: if ``category_dict`` is empty.
    """
    if not category_dict:
        raise ValueError("category_dict must define at least one variable to categorize on")

    n_cat_bins = [len(edges) - 1 for edges in category_dict.values()]
    categories = np.arange(np.prod(n_cat_bins)).reshape(*n_cat_bins)

    # `.loc` slices by label: slice(None) selects every row whatever the index
    # looks like, whereas slice(0, None) requires/assumes a label 0.
    rows = slice(None) if cut == "" else df.eval(cut)

    # Variables are named either with prefixes or with suffixes, never both.
    if var_prefixes is None:
        var_prefixes = ['', '']
    else:
        var_suffixes = ['', '']
    leptons = list(zip(var_suffixes, var_prefixes))

    # Rows for which every variable of every lepton falls inside its bins.
    selection = np.ones(len(df), dtype=bool)
    for name, bins in category_dict.items():
        n_bins = len(bins) - 1
        for ele_suffix, ele_prefix in leptons:
            var_col = f"{ele_prefix}{name}{ele_suffix}"
            bin_col = f'{prefix}_{ele_prefix}{name}{ele_suffix}'
            df[bin_col] = np.int32(-1)
            # np.digitize returns 0 below the first edge and len(bins) above
            # the last one, hence -1 / n_bins after the shift (out of range).
            df.loc[rows, bin_col] = (np.digitize(df.loc[rows, var_col], bins) - 1).astype(np.int32)
            bin_idx = df[bin_col].to_numpy()
            selection &= (bin_idx >= 0) & (bin_idx < n_bins)

    for i_ele, (ele_suffix, ele_prefix) in enumerate(leptons, start=1):
        # One index array per variable -> element-wise lookup in `categories`.
        bin_indices = tuple(
            df[f"{prefix}_{ele_prefix}{name}{ele_suffix}"].to_numpy()[selection]
            for name in category_dict
        )
        lepton_cat = np.full(len(df), -1, dtype=np.int32)
        lepton_cat[selection] = categories[bin_indices]
        df[f'{prefix}{i_ele}'] = lepton_cat

    return categories