Skip to content

pandas_utils module

Utility functions for pandas:

  • bspd_print: pretty-prints a data frame
  • bspd_cross_products: generates cross-products of variables
  • bspd_statsdf: makes a dataframe with columns taken from one or several arrays and named with the specified column names.
  • bspd_prepareplot: prepares a dataframe for plotting (very specific).

bspd_cross_products(df, l1, l2=None, with_squares=True)

Returns a DataFrame with cross-products of the variables of df whose names are in l1 and l2.

Parameters:

Name Type Description Default
df DataFrame

any data frame

required
l1 list[str]

a list of names of variables that belong to df

required
l2 list[str] | None

a second list of names of variables that belong to df; l1 by default

None
with_squares bool | None

if False, we drop the squares. True by default.

True

Returns:

Type Description
DataFrame

the data frame of cross-products with concatenated names.

Source code in bs_python_utils/pandas_utils.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def bspd_cross_products(
    df: pd.DataFrame,
    l1: list[str],
    l2: list[str] | None = None,
    with_squares: bool | None = True,
) -> pd.DataFrame:
    """Returns a DataFrame with cross-products of the variables of `df`
    whose names are in `l1` and `l2`.

    Args:
        df: any data frame
        l1: a list of names of variables that belong to `df`
        l2: ibidem; `l1` by default
        with_squares: if `False`, we drop the squares. `True` by default.

    Returns:
        the data frame of cross-products with concatenated names.
    """

    lp2 = l1 if l2 is None else l2
    l12 = list(product(l1, lp2))
    cross_pairs = [[x[0], x[1]] for x in l12 if x[0] != x[1]]
    unique_pairs = []
    for _i, c in enumerate(cross_pairs):
        print(c)
        c_ordered = c if c[0] < c[1] else list(reversed(c))
        print(c_ordered)
        if c_ordered not in unique_pairs:
            unique_pairs.append(c_ordered)
        print(unique_pairs)

    col_names = sorted([(x[0], x[1], f"{x[0]}*{x[1]}") for x in unique_pairs])

    if with_squares:
        col_names_squares = sorted(
            [(x[0], x[1], f"{x[0]}**2") for x in l12 if x[0] == x[1]]
        )
        col_names += col_names_squares

    df_cprods = pd.DataFrame(
        {col_name: df[x0] * df[x1] for (x0, x1, col_name) in col_names}
    )

    return df_cprods

bspd_prepareplot(df)

Parameters:

Name Type Description Default
df DataFrame

any dataframe whose column names either all end in '_n' for n an integer, or none does

required

Returns:

Type Description
DataFrame

a properly melted dataframe for plotting, with columns 'Sample', 'Statistic', 'Value', and 'Group' if there are several integers.

Source code in bs_python_utils/pandas_utils.py
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def bspd_prepareplot(df: pd.DataFrame) -> pd.DataFrame:
    """
    Melts a dataframe into long format for plotting.

    Args:
        df: any dataframe whose column names either all end in '_n' for n an integer, or none does

    Returns:
        a properly melted dataframe for plotting, with columns 'Sample', 'Statistic', 'Value',
        and 'Group' if there are several integers.
    """
    # the helper checks the column names; presumably it returns the distinct
    # integer suffixes it found (only its length is used here)
    suffixes_found = _check_names_n(df.columns)
    stat_columns = list(df.columns)

    # number the rows so that each observation keeps its identity after melting
    with_sample = df.assign(Sample=np.arange(df.shape[0]))
    long_df = pd.melt(
        with_sample,
        id_vars="Sample",
        value_vars=stat_columns,
        var_name="Statistic",
        value_name="Value",
    )

    if len(suffixes_found) < 2:
        return long_df

    # at least two different groups of statistics: split each name at its first
    # underscore into the statistic and its group
    name_parts = long_df["Statistic"].str.split("_", n=1, expand=True)
    return long_df.drop(columns=["Statistic"]).assign(
        Statistic=name_parts[0], Group=name_parts[1]
    )

bspd_print(df, s='', max_rows=None, max_cols=None, precision=None)

Pretty-prints a data frame

Parameters:

Name Type Description Default
df DataFrame

any data frame

required
s str

an optional title string

''
max_rows int | None

maximum number of rows to print (all by default)

None
max_cols int | None

maximum number of columns to print (all by default)

None
precision int | None

display precision of the numbers; 3 digits by default.

None

Returns:

Type Description
None

nothing.

Source code in bs_python_utils/pandas_utils.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def bspd_print(
    df: pd.DataFrame,
    s: str = "",
    max_rows: int | None = None,
    max_cols: int | None = None,
    precision: int | None = None,
) -> None:
    """Pretty-prints a data frame

    The title `s` is passed to `print_stars`, then `df` is printed with the
    pandas display options set for the duration of the call only.

    Args:
        df: any data frame
        s: an optional title string
        max_rows: maximum number of rows to print (all by default)
        max_cols: maximum number of columns to print (all by default)
        precision: display precision of the numbers. 3 digits by default,
            that is, when `None` is passed.

    Returns:
        nothing.
    """
    # `None` stands for the documented default of 3 digits; passing `None`
    # through to `display.precision` would not apply that default
    digits = 3 if precision is None else precision

    print_stars(s)
    display_options = (
        "display.max_rows",
        max_rows,
        "display.max_columns",
        max_cols,
        "display.precision",
        digits,
    )
    with pd.option_context(*display_options):
        print(df)

bspd_statsdf(T, col_names)

Make a dataframe with columns from the array(s) in T and names from col_names.

Parameters:

Name Type Description Default
T ndarray | list[ndarray]

a list of n_T matrices or vectors with N rows, or a matrix or a vector with N rows

required
col_names str | list[str] | list[str | list[str]]

a list of n_T name objects; a name object must be a string or a list of strings, with the names for the column(s) of the corresponding T matrix

required

Returns:

Type Description
DataFrame

a dataframe with the named columns.

Source code in bs_python_utils/pandas_utils.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def bspd_statsdf(
    T: np.ndarray | list[np.ndarray],
    col_names: str | list[str] | list[str | list[str]],
) -> pd.DataFrame:
    """
    Build a dataframe whose columns come from the array(s) in `T`, named after `col_names`.

    When `T` is a list, its arrays are stacked side by side, and the name object
    at position k (counting from 1) is passed to `_list_str` with the suffix `_k`.

    Args:
        T: a list of n_T matrices or vectors with N rows, or a matrix or a vector with N rows
        col_names: a list of n_T name objects; a name object must be a string or a list of strings,
            with the names for the column(s) of the corresponding T matrix

    Returns:
        a dataframe with the named columns.
    """
    if not isinstance(T, list):
        # a single array: a vector needs one name, a matrix one name per column
        n_dims = check_vector_or_matrix(T)
        if n_dims == 1:
            if not isinstance(col_names, str):
                bs_error_abort(f"T is a vector but col_names is {col_names}")
            df = pd.DataFrame(T, columns=[col_names], copy=True)
        elif n_dims == 2:
            _n_rows, n_cols = T.shape
            n_names = len(col_names)
            if n_names != n_cols:
                bs_error_abort(f"T is {T.shape} but col_names has {n_names} elements")
            df = pd.DataFrame(T, columns=col_names, copy=True)
    else:
        n_arrays = len(T)
        _check_colnames(col_names, n_arrays)

        # all the arrays must agree on their first dimension
        shapes = [arr.shape for arr in T]
        row_counts = {shape[0] for shape in shapes}
        if len(row_counts) > 1:
            bs_error_abort("All T arrays should have the same number of rows.")

        # start from the first array and append the others column-wise
        stacked = T[0]
        all_names = _list_str(col_names[0], suffix="_1")
        for number, arr in enumerate(T[1:], start=2):
            stacked = np.column_stack((stacked, arr))
            all_names.extend(_list_str(col_names[number - 1], suffix=f"_{number}"))

        df = pd.DataFrame(stacked, columns=all_names, copy=True)

    return df