pandas_utils module¶

Utility functions for pandas:

bspd_print: pretty-prints a data frame
bspd_cross_products: generates cross-products of variables
bspd_statsdf: makes a dataframe with columns taken from one or several arrays and with the specified column names.
bspd_prepareplot: prepares a dataframe for plotting (very specific).

`bspd_cross_products(df, l1, l2=None, with_squares=True)` ¶

Returns a DataFrame with cross-products of the variables of df whose names are in l1 and l2.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	any data frame	required
`l1`	`list[str]`	a list of names of variables that belong to `df`	required
`l2`	`list[str] \| None`	a second list of names of variables that belong to `df`; `l1` by default	`None`
`with_squares`	`bool \| None`	if `False`, we drop the squares. `True` by default.	`True`

Returns:

Type	Description
`DataFrame`	the data frame of cross-products with concatenated names.

Source code in bs_python_utils/pandas_utils.py

def bspd_cross_products(
    df: pd.DataFrame,
    l1: list[str],
    l2: list[str] | None = None,
    with_squares: bool | None = True,
) -> pd.DataFrame:
    """Returns a DataFrame with cross-products of the variables of `df`
    whose names are in `l1` and `l2`.

    Args:
        df: any data frame
        l1: a list of names of variables that belong to `df`
        l2: ibidem; `l1` by default
        with_squares: if `False`, we drop the squares. `True` by default.

    Returns:
        the data frame of cross-products with concatenated names.
    """

    lp2 = l1 if l2 is None else l2
    l12 = list(product(l1, lp2))
    cross_pairs = [[x[0], x[1]] for x in l12 if x[0] != x[1]]
    unique_pairs = []
    for _i, c in enumerate(cross_pairs):
        print(c)
        c_ordered = c if c[0] < c[1] else list(reversed(c))
        print(c_ordered)
        if c_ordered not in unique_pairs:
            unique_pairs.append(c_ordered)
        print(unique_pairs)

    col_names = sorted([(x[0], x[1], f"{x[0]}*{x[1]}") for x in unique_pairs])

    if with_squares:
        col_names_squares = sorted(
            [(x[0], x[1], f"{x[0]}**2") for x in l12 if x[0] == x[1]]
        )
        col_names += col_names_squares

    df_cprods = pd.DataFrame(
        {col_name: df[x0] * df[x1] for (x0, x1, col_name) in col_names}
    )

    return df_cprods

`bspd_prepareplot(df)` ¶

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	any dataframe whose column names either all end in '_n' for n an integer, or none does	required

Returns:

Type	Description
`DataFrame`	a properly melted dataframe for plotting, with columns 'Sample', 'Statistic', 'Value', and 'Group' if there are several integers.

Source code in bs_python_utils/pandas_utils.py

def bspd_prepareplot(df: pd.DataFrame) -> pd.DataFrame:
    """Melts a dataframe into long format for plotting.

    A 'Sample' column numbering the rows of `df` from 0 is added, and every
    original column is stacked into ('Statistic', 'Value') pairs.

    Args:
        df: any dataframe whose column names either all end in '_n' for n an integer, or none does

    Returns:
        a properly melted dataframe for plotting, with columns 'Sample', 'Statistic', 'Value',
        and 'Group' if there are several integers. 'Group' holds the suffix as a string,
        and 'Statistic' then holds the column name without its '_n' suffix.
    """
    # check the names of the columns
    # (presumably returns the distinct integer suffixes found -- confirm in `_check_names_n`)
    values_integers = _check_names_n(df.columns)

    df2 = df.copy()
    df2["Sample"] = np.arange(df.shape[0])
    dfm = pd.melt(
        df2,
        id_vars="Sample",
        value_vars=list(df.columns),
        var_name="Statistic",
        value_name="Value",
    )
    if len(values_integers) <= 1:
        return dfm

    # at least two different groups of statistics:
    # split on the LAST underscore, since the group is the trailing '_n';
    # the statistic name itself may contain underscores
    stat_group = dfm["Statistic"].str.rsplit("_", n=1, expand=True)
    # drop and re-add so that the column order is Sample, Value, Statistic, Group
    dfm = dfm.drop(columns=["Statistic"])
    dfm["Statistic"] = stat_group[0]
    dfm["Group"] = stat_group[1]
    return dfm

`bspd_print(df, s='', max_rows=None, max_cols=None, precision=None)` ¶

Pretty-prints a data frame

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	any data frame	required
`s`	`str`	an optional title string	`''`
`max_rows`	`int \| None`	maximum number of rows to print (all by default)	`None`
`max_cols`	`int \| None`	maximum number of columns to print (all by default)	`None`
`precision`	`int \| None`	precision of the printed numbers; 3 digits by default.	`None`

Returns:

Type	Description
`None`	nothing.

Source code in bs_python_utils/pandas_utils.py

def bspd_print(
    df: pd.DataFrame,
    s: str = "",
    max_rows: int | None = None,
    max_cols: int | None = None,
    precision: int | None = None,
) -> None:
    """Pretty-prints a data frame

    The title is printed with `print_stars`, then the data frame is printed
    with the pandas display options temporarily set from the arguments.

    Args:
        df: any data frame
        s: an optional title string
        max_rows: maximum number of rows to print (all by default)
        max_cols: maximum number of columns to print (all by default)
        precision: precision of the printed numbers. 3 digits by default.

    Returns:
        nothing.
    """
    print_stars(s)

    # `None` means "no limit" for the row and column options, but it is not a
    # number of digits: resolve the documented default before forwarding it
    n_digits = 3 if precision is None else precision

    # the options are only changed inside the `with` block
    with pd.option_context(
        "display.max_rows",
        max_rows,
        "display.max_columns",
        max_cols,
        "display.precision",
        n_digits,
    ):
        print(df)

`bspd_statsdf(T, col_names)` ¶

Make a dataframe with columns from the array(s) in T and names from col_names.

Parameters:

Name	Type	Description	Default
`T`	`ndarray \| list[ndarray]`	a list of n_T matrices or vectors with N rows, or a matrix or a vector with N rows	required
`col_names`	`str \| list[str] \| list[str \| list[str]]`	a list of n_T name objects; a name object must be a string or a list of strings, with the names for the column(s) of the corresponding T matrix	required

Returns:

Type	Description
`DataFrame`	a dataframe with the named columns.

Source code in bs_python_utils/pandas_utils.py

def bspd_statsdf(
    T: np.ndarray | list[np.ndarray],
    col_names: str | list[str] | list[str | list[str]],
) -> pd.DataFrame:
    """
    Build a dataframe whose columns come from the array(s) in `T`, named after `col_names`.

    When `T` is a list, its arrays are stacked side by side and the names of
    the `i`-th array (counting from 1) are passed to `_list_str` with suffix `_i`.

    Args:
        T: a list of n_T matrices or vectors with N rows, or a matrix or a vector with N rows
        col_names: a list of n_T name objects; a name object must be a string or a list of strings,
            with the names for the column(s) of the corresponding T matrix

    Returns:
        a dataframe with the named columns.
    """
    if not isinstance(T, list):
        # a single vector or matrix
        n_dims = check_vector_or_matrix(T)
        if n_dims == 1:
            # a vector gives one column, so a single name is expected
            if not isinstance(col_names, str):
                bs_error_abort(f"T is a vector but col_names is {col_names}")
            df = pd.DataFrame(T, columns=[col_names], copy=True)
        elif n_dims == 2:
            _, n_cols = T.shape
            n_names = len(col_names)
            if n_names != n_cols:
                bs_error_abort(f"T is {T.shape} but col_names has {n_names} elements")
            df = pd.DataFrame(T, columns=col_names, copy=True)
    else:
        # several arrays, to be stacked side by side
        n_arrays = len(T)
        _check_colnames(col_names, n_arrays)

        shapes = [arr.shape for arr in T]
        row_counts = {shape[0] for shape in shapes}
        if len(row_counts) > 1:
            bs_error_abort("All T arrays should have the same number of rows.")

        stacked = T[0]
        all_names = _list_str(col_names[0], suffix="_1")
        for pos, arr in enumerate(T[1:], start=1):
            stacked = np.column_stack((stacked, arr))
            # suffixes are 1-based while `pos` is 0-based
            all_names.extend(_list_str(col_names[pos], suffix=f"_{pos+1}"))

        df = pd.DataFrame(stacked, columns=all_names, copy=True)

    return df

pandas_utils module¶

bspd_cross_products(df, l1, l2=None, with_squares=True) ¶

bspd_prepareplot(df) ¶

bspd_print(df, s='', max_rows=None, max_cols=None, precision=None) ¶

bspd_statsdf(T, col_names) ¶

`bspd_cross_products(df, l1, l2=None, with_squares=True)` ¶

`bspd_prepareplot(df)` ¶

`bspd_print(df, s='', max_rows=None, max_cols=None, precision=None)` ¶

`bspd_statsdf(T, col_names)` ¶