# Source code for regpyhdfe.utils

import numpy as np
import pandas as pd

def sklearn_to_df(sklearn_dataset):
    """Convert (as well as it can) an sklearn dataset to a Pandas dataframe.

    Args:
        sklearn_dataset (sklearn.utils.Bunch): usually the result of using
            sklearn to quickly get a dataset, e.g. the object returned by one
            of the ``sklearn.datasets.load_*`` functions. It must expose
            ``data`` and ``target`` attributes; ``feature_names`` is used for
            the column labels when present.
    Returns:
        Pandas dataframe df whose columns are the features of the dataset and
        where df['target'] is the target variable in the original dataset.
    """
    # Not every dataset carries feature names; passing columns=None lets
    # pandas fall back to its default integer column labels.
    feature_names = getattr(sklearn_dataset, 'feature_names', None)
    df = pd.DataFrame(sklearn_dataset.data, columns=feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

def add_intercept(X):
    """Prepend a column of 1s (an intercept column) to a 2D numpy array.

    Args:
        X (numpy array): 2D numpy array (or array-like convertible to one)
            with one row per observation.
    Returns:
        A new 2D numpy array whose first column is all 1s and whose remaining
        columns are the columns of X, in their original order.
    """
    # Coerce array-likes (e.g. nested lists) so that .shape is available.
    X = np.asarray(X)
    # One 1.0 per row of X.
    intercept = np.ones(X.shape[0])
    # np.c_ stacks column-wise, so the intercept ends up as the FIRST column.
    return np.c_[intercept, X]

def get_np_columns(df, columns, intercept=False):
    """Helper used to retrieve columns as a numpy array.

    Args:
        df (pandas dataframe): dataframe containing desired columns
        columns (list of strings): list of names of desired columns.
                                    A single column name given as a bare
                                    string is treated as a one-element list.
        intercept (bool): set to True if you'd like the resulting numpy array
                            to have a column of 1s prepended to it.
    Returns:
        2D float64 numpy array with columns of array consisting of feature
        vectors, i.e. the first column of the result is a numpy vector of the
        first column named in columns argument (or the second column of the
        result when intercept is True, since the column of 1s comes first).
        Returns None when columns is empty.

    """
    # A bare string would otherwise be indexed character by character.
    if isinstance(columns, str):
        columns = [columns]
    # Any empty sequence (list, tuple, ...) means "no columns requested".
    if len(columns) == 0:
        return None
    # Convert each column on its own, then stack once; this avoids re-copying
    # the growing matrix for every additional column.
    vectors = [df[name].to_numpy().astype('float64') for name in columns]
    res = np.column_stack(vectors)
    if intercept:
        res = add_intercept(res)
    return res