`vayu.pandas_utils`¶

Requires the vayulib[data] extra.

pandas_utils ¶

concat_frame_from_dir ¶

concat_frame_from_dir(
    path,
    prefix: str = None,
    extension="parquet",
    progress=False,
) -> DataFrame

Concatenate all dataframes in a directory

Source code in vayu/pandas_utils.py

def concat_frame_from_dir(
    path, prefix: Optional[str] = None, extension="parquet", progress=False
) -> pd.DataFrame:
    """Concatenate all dataframes in a directory.

    Files matching ``*.{extension}`` directly inside ``path`` are read in order of
    file name and stacked row-wise under a fresh integer index.

    Args:
        path: Directory containing the files.
        prefix: If given, only files whose name starts with this prefix are read.
        extension: File extension without the dot. One of "feather", "csv", "pickle"
            or "parquet"; it also selects the pandas reader that is used.
        progress: If True, print one line after each file has been read.

    Returns:
        The concatenation of all non-empty frames. If every frame that was read is
        empty, an empty frame that keeps their columns. If no file matched, an empty
        DataFrame.

    Raises:
        ValueError: If ``extension`` is not one of the supported formats.
    """
    # NOTE: pd.read_pickle unpickles the file contents, which can execute arbitrary
    # code. Only point extension="pickle" at directories with trusted files.
    readers = {
        "feather": pd.read_feather,
        "csv": pd.read_csv,
        "pickle": pd.read_pickle,
        "parquet": pd.read_parquet,
    }
    reader_func = readers.get(extension)
    if reader_func is None:
        # Fail up front with a clear message instead of calling None later on.
        raise ValueError(
            f"Unsupported extension {extension!r}; expected one of {sorted(readers)}"
        )

    files = Path(path).glob(f"*.{extension}")
    if prefix is not None:
        files = (f for f in files if f.name.startswith(prefix))
    files = sorted(files, key=attrgetter("name"))

    frags = []
    n = len(files)
    for i, f in enumerate(files, start=1):
        frags.append(reader_func(f))
        if progress:
            print(f"Finished reading [{i}/{n}] {f.name}")

    if not frags:
        # pd.concat refuses an empty list, so hand back an empty frame directly.
        return pd.DataFrame()

    # Empty fragments are skipped, but when all of them are empty they are still
    # concatenated so that the result carries the column layout.
    non_empty = [frag for frag in frags if not frag.empty]
    return pd.concat(non_empty or frags, ignore_index=True)

slice_frame ¶

slice_frame(
    interval: Interval,
    df: DataFrame,
    level: Optional[int] = None,
    key: Optional[Hashable] = None,
    axis: int = 0,
    exclude: bool = False,
) -> DataFrame

Slice a dataframe using this interval.

Parameters:

Name	Type	Description	Default
`interval`	`Interval`	The interval to slice	required
`df`	`DataFrame`	The dataframe to slice.	required
`level`	`Optional[int]`	The index or column level to slice on	`None`
`key`	`Optional[Hashable]`	The row index or column to slice on	`None`
`axis`	`int`	The axis to slice on (0 for index, 1 for columns)	`0`
`exclude`	`bool`	If True, exclude the interval instead of including it	`False`

Notes

If key is not specified, the level (or index if level is None) should be sorted in increasing order
If neither key nor level is specified, the 0th level of the index is used for slicing.

Source code in vayu/pandas_utils.py

def slice_frame(
    interval: Interval,
    df: "pd.DataFrame",
    level: Optional[int] = None,
    key: Optional[Hashable] = None,
    axis: int = 0,
    exclude: bool = False,
) -> "pd.DataFrame":
    """Slice a dataframe using this interval.

    Args:
        interval: The interval to slice; its ``start`` and ``end`` are used as bounds.
        df: The dataframe to slice.
        level: The index or column level to slice on
        key: The row index or column to slice on
        axis: The axis to slice on (0 for index, 1 for columns)
        exclude: If True, exclude the interval instead of including it

    Returns:
        The part of ``df`` inside the interval, or the part outside it when ``exclude`` is True.

    Notes:
        - If key is not specified, the level (or index if level is None) should be sorted in increasing order
        - If neither key nor level is specified, the 0th level of the index is used for slicing.
        - ``level`` takes precedence over ``key`` when both are given.
    """
    _validate_libraries()
    is_dataframe = isinstance(df, pd.DataFrame)
    # Label-based exclusion is only needed when the selection is not a boolean mask.
    drop_by_label = exclude

    if level is not None:
        # One slice per level of the axis that is actually being sliced; the column
        # axis can have a different number of levels than the index.
        labels = df.columns if is_dataframe and axis != 0 else df.index
        slicer = [slice(None)] * labels.nlevels
        slicer[level] = slice(interval.start, interval.end)
        slicer = pd.IndexSlice[tuple(slicer)]
    elif key is not None:
        values = df[key] if axis == 0 else df.loc[key]
        slicer = (interval.start <= values) & (values <= interval.end)
        if exclude and isinstance(slicer, pd.Series):
            # Negating the mask keeps the original order and stays correct when
            # labels repeat; a label difference would also drop non-matching
            # entries that share a label with a matching one.
            slicer = ~slicer
            drop_by_label = False
    else:
        slicer = pd.IndexSlice[interval.start : interval.end]

    if is_dataframe:
        sliced = df.loc[slicer, :] if axis == 0 else df.loc[:, slicer]
        if drop_by_label:
            sliced = (
                df.loc[df.index.difference(sliced.index), :]
                if axis == 0
                else df.loc[:, df.columns.difference(sliced.columns)]
            )
    else:
        # Anything that is not a DataFrame (e.g. a Series) only has an index to slice.
        sliced = df.loc[slicer]
        if drop_by_label:
            sliced = df.loc[df.index.difference(sliced.index)]

    return sliced

is_frame_empty ¶

is_frame_empty(
    df: Optional[Union[DataFrame, Series]],
) -> bool

Check if a dataframe is empty.

Source code in vayu/pandas_utils.py

def is_frame_empty(df: Optional[Union["pd.DataFrame", "pd.Series"]]) -> bool:
    """Tell whether a frame or series holds no rows.

    A missing object (``None``) counts as empty; otherwise only the length of the
    object decides.
    """
    if df is None:
        return True
    row_count = len(df)
    return row_count == 0

get_frame_window ¶

get_frame_window(
    df: DataFrame, column: str = None, level: int = 0
) -> Optional[TimeWindow]

Get the time window of a dataframe.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe	required
`column`	`str`	If specified, the window is computed from the min and max of the column.	`None`
`level`	`int`	Window is computed from the min and max of the index at the specified level.	`0`

Source code in vayu/pandas_utils.py

def get_frame_window(
    df: "pd.DataFrame", column: Optional[str] = None, level: int = 0
) -> Optional[TimeWindow]:
    """Get the time window of a dataframe.

    Args:
        df: dataframe
        column: If specified, the window is computed from the min and max of the column.
        level: Window is computed from the min and max of the index at the specified level.
            Only consulted when ``column`` is not given and the index entries have
            several components.

    Returns:
        A ``TimeWindow`` spanning the smallest to the largest value, or None when
        ``df`` is None or has no rows. The values must provide ``.timestamp()``.
    """
    _validate_libraries()
    if is_frame_empty(df):
        return None
    if column:
        # A falsy column name (None or "") falls through to the index.
        start_time, end_time = df[column].min(), df[column].max()
    elif isinstance(df.index, pd.MultiIndex):
        # min()/max() of a MultiIndex compare whole tuples lexicographically, so
        # taking element `level` of those tuples is only right for level 0.
        # Reduce the requested level on its own instead.
        level_values = df.index.get_level_values(level)
        start_time, end_time = level_values.min(), level_values.max()
    else:
        start_time, end_time = df.index.min(), df.index.max()
        if isinstance(start_time, tuple):
            # Flat index whose entries are tuples: keep picking the requested component.
            start_time, end_time = start_time[level], end_time[level]

    return TimeWindow(from_timestamp(start_time.timestamp()), from_timestamp(end_time.timestamp()))

split_frame ¶

split_frame(
    obj: Union[Series, DataFrame],
    n: Optional[Union[int, float, datetime]] = 0.5,
) -> Union[
    Tuple[Series, Series], Tuple[DataFrame, DataFrame]
]

Split a dataframe or series into two parts.

Parameters:

Name	Type	Description	Default
`obj`	`Union[Series, DataFrame]`	The object to split	required
`n`	`Optional[Union[int, float, datetime]]`	The index to split at. If float, it is treated as a fraction of the length of the object. If int, it is treated as an index. If datetime, it is treated as a timestamp.	`0.5`

Returns:

Type	Description
`Union[Tuple[Series, Series], Tuple[DataFrame, DataFrame]]`	A tuple of two objects, the first part and the second part.

Source code in vayu/pandas_utils.py

def split_frame(
    obj: Union["pd.Series", "pd.DataFrame"], n: Optional[Union[int, float, datetime]] = 0.5
) -> Union[Tuple["pd.Series", "pd.Series"], Tuple["pd.DataFrame", "pd.DataFrame"]]:
    """Split a dataframe or series into two parts.

    Args:
        obj: The object to split
        n: The index to split at. If float, it is treated as a fraction of the length of the object.
            If int, it is treated as an index.
            If datetime, it is treated as a timestamp.

    Returns:
        A tuple of two objects, the first part and the second part.

    Raises:
        ValueError: If ``n`` is a float whose magnitude is greater than 1.

    Notes:
        - Float and int splits are positional: the row at position ``n`` starts the second part.
        - Any other ``n`` is used as a label with ``.loc``. Label slices include both
          endpoints, so a row labelled exactly ``n`` appears in both parts.
    """
    _validate_libraries()
    if isinstance(n, float):
        # Explicit check instead of `assert`, which is skipped when Python runs with -O.
        if not abs(n) <= 1:
            raise ValueError(
                f"Float split index should be less than or equal to 1 in magnitude, got {n}"
            )
        n = math.floor(len(obj) * n)
    if isinstance(n, int):
        return obj.iloc[:n], obj.iloc[n:]
    return obj.loc[:n], obj.loc[n:]

select_frame ¶

select_frame(frame: DataFrame, **conditions)

Select rows from a dataframe based on conditions.

Source code in vayu/pandas_utils.py

def select_frame(frame: "pd.DataFrame", **conditions) -> "pd.DataFrame":
    """Select rows from a dataframe based on conditions.

    Each keyword is either ``column=value`` or ``column__op=value``; without an
    ``__op`` suffix the operator is ``"eq"``. Conditions are applied one after the
    other, each on the result of the previous one.

    Args:
        frame: The dataframe to filter.
        **conditions: Column conditions. An ``Interval`` value slices the frame on that
            column (the ``"neq"`` / ``"ne"`` operators exclude the interval instead);
            any other value is passed to ``_selector`` together with the operator.

    Returns:
        The rows of ``frame`` that satisfy every condition.
    """
    _validate_libraries()
    for spec, condition in conditions.items():
        # Split on the LAST "__" so that column names containing "__" can still
        # carry an operator suffix (e.g. "a__b__gt" -> column "a__b", op "gt").
        column, sep, op = spec.rpartition("__")
        if not sep:
            column, op = spec, "eq"
        if isinstance(condition, Interval):
            frame = slice_frame(condition, frame, key=column, exclude=op in ("neq", "ne"))
        else:
            frame = frame[_selector(frame[column], op, condition)]
    return frame

vayu.pandas_utils¶

pandas_utils ¶

concat_frame_from_dir ¶

slice_frame ¶

is_frame_empty ¶

get_frame_window ¶

split_frame ¶

select_frame ¶

`vayu.pandas_utils`¶