Skip to content

vayu.pandas_utils

Requires the vayulib[data] extra.

pandas_utils

concat_frame_from_dir

concat_frame_from_dir(
    path,
    prefix: str = None,
    extension="parquet",
    progress=False,
) -> DataFrame

Concatenate all dataframes in a directory

Source code in vayu/pandas_utils.py
def concat_frame_from_dir(
    path, prefix: Optional[str] = None, extension="parquet", progress=False
) -> pd.DataFrame:
    """Concatenate all dataframes stored as files in a directory.

    Files are read in ascending order of file name; empty frames are skipped
    and the remaining ones are stacked with a fresh integer index.

    Args:
        path: Directory to scan (anything accepted by ``pathlib.Path``).
        prefix: If given, only files whose name starts with it are read.
        extension: File extension without the leading dot. It selects both
            the glob pattern and the reader; one of "feather", "csv",
            "pickle" or "parquet".
        progress: If True, print one line after each file has been read.

    Returns:
        The concatenated frame, or an empty DataFrame when no file matched
        or every file that was read turned out to be empty.

    Raises:
        ValueError: If ``extension`` is not one of the supported formats.
    """
    # NOTE: unpickling can execute arbitrary code; only use the "pickle"
    # reader on directories whose contents are trusted.
    readers = {
        "feather": pd.read_feather,
        "csv": pd.read_csv,
        "pickle": pd.read_pickle,
        "parquet": pd.read_parquet,
    }
    reader_func = readers.get(extension)
    if reader_func is None:
        # Fail early with a clear message instead of calling None later on.
        raise ValueError(
            f"Unsupported extension {extension!r}; expected one of {sorted(readers)}"
        )

    path = Path(path)
    files = path.glob(f"*.{extension}")
    if prefix is not None:
        files = [f for f in files if f.name.startswith(prefix)]
    files = sorted(files, key=attrgetter("name"))

    frags = []
    n = len(files)
    for i, f in enumerate(files):
        frags.append(reader_func(f))
        if progress:
            print(f"Finished reading [{i + 1}/{n}] {f.name}")

    non_empty = [frag for frag in frags if not frag.empty]
    if not non_empty:
        # pd.concat raises on an empty list; an empty frame is the natural result.
        return pd.DataFrame()
    return pd.concat(non_empty, ignore_index=True)

slice_frame

slice_frame(
    interval: Interval,
    df: DataFrame,
    level: Optional[int] = None,
    key: Optional[Hashable] = None,
    axis: int = 0,
    exclude: bool = False,
) -> DataFrame

Slice a dataframe using this interval.

Parameters:

Name Type Description Default
interval Interval

The interval to slice

required
df DataFrame

The dataframe to slice.

required
level Optional[int]

The index or column level to slice on

None
key Optional[Hashable]

The row index or column to slice on

None
axis int

The axis to slice on (0 for index, 1 for columns)

0
exclude bool

If True, exclude the interval instead of including it

False
Notes
  • If key is not specified, the level (or index if level is None) should be sorted in increasing order
  • If neither key nor level is specified, the 0th level of the index is used for slicing.
Source code in vayu/pandas_utils.py
def slice_frame(
    interval: Interval,
    df: "pd.DataFrame",
    level: Optional[int] = None,
    key: Optional[Hashable] = None,
    axis: int = 0,
    exclude: bool = False,
) -> "pd.DataFrame":
    """Slice a dataframe (or series) using the given interval.

    Both ``interval.start`` and ``interval.end`` are treated as inclusive
    bounds.

    Args:
        interval: The interval to slice
        df: The dataframe to slice.
        level: The index or column level to slice on. Takes precedence over
            ``key`` when both are given.
        key: The column (``axis=0``) or row label (``axis=1``) whose values
            are compared against the interval.
        axis: The axis to slice on (0 for index, 1 for columns)
        exclude: If True, exclude the interval instead of including it

    Returns:
        The selected part of ``df``, or its complement when ``exclude`` is True.

    Notes:
        - If key is not specified, the level (or index if level is None) should be sorted in increasing order
        - If neither key nor level is specified, the 0th level of the index is used for slicing.
    """
    _validate_libraries()
    is_frame = isinstance(df, pd.DataFrame)
    mask_based = False
    if level is not None:
        # One slice per level of the axis that is actually being sliced;
        # only a DataFrame sliced with axis != 0 uses its columns.
        labels = df.index if axis == 0 or not is_frame else df.columns
        slicer = [slice(None)] * labels.nlevels
        slicer[level] = slice(interval.start, interval.end)
        slicer = pd.IndexSlice[tuple(slicer)]
    elif key is not None:
        values = df[key] if axis == 0 else df.loc[key]
        slicer = (interval.start <= values) & (values <= interval.end)
        mask_based = True
    else:
        slicer = pd.IndexSlice[interval.start : interval.end]

    if mask_based:
        # Negating the boolean mask keeps row order and is correct even when
        # index labels are duplicated, unlike a label-set difference.
        selector = ~slicer if exclude else slicer
        if not is_frame:
            return df.loc[selector]
        return df.loc[selector, :] if axis == 0 else df.loc[:, selector]

    if is_frame:
        sliced = df.loc[slicer, :] if axis == 0 else df.loc[:, slicer]
        if exclude:
            sliced = (
                df.loc[df.index.difference(sliced.index), :]
                if axis == 0
                else df.loc[:, df.columns.difference(sliced.columns)]
            )
    else:
        sliced = df.loc[slicer]
        if exclude:
            sliced = df.loc[df.index.difference(sliced.index)]

    return sliced

is_frame_empty

is_frame_empty(
    df: Optional[Union[DataFrame, Series]],
) -> bool

Check if a dataframe is empty.

Source code in vayu/pandas_utils.py
def is_frame_empty(df: Optional[Union["pd.DataFrame", "pd.Series"]]) -> bool:
    """Return True when ``df`` is None or has zero length."""
    if df is None:
        return True
    # Length-based on purpose: a frame with rows but no columns is not empty here.
    return len(df) == 0

get_frame_window

get_frame_window(
    df: DataFrame, column: str = None, level: int = 0
) -> Optional[TimeWindow]

Get the time window of a dataframe.

Parameters:

Name Type Description Default
df DataFrame

dataframe

required
column str

If specified, the window is computed from the min and max of the column.

None
level int

Window is computed from the min and max of the index at the specified level.

0
Source code in vayu/pandas_utils.py
def get_frame_window(
    df: "pd.DataFrame", column: Optional[str] = None, level: int = 0
) -> Optional[TimeWindow]:
    """Get the time window of a dataframe.

    Args:
        df: dataframe
        column: If specified (and non-empty), the window is computed from the
            min and max of the column.
        level: For a multi-level index, the window is computed from the min
            and max of the index at the specified level. Ignored when
            ``column`` is given or the index has a single level.

    Returns:
        None when ``df`` is None or has no rows; otherwise a ``TimeWindow``
        built from the smallest and largest timestamp found.
    """
    _validate_libraries()
    if is_frame_empty(df):
        return None
    if column:
        values = df[column]
    elif isinstance(df.index, pd.MultiIndex):
        # min()/max() on a MultiIndex compare whole tuples, which only gives
        # the right answer for level 0; take the requested level's own values.
        values = df.index.get_level_values(level)
    else:
        values = df.index
    start_time, end_time = values.min(), values.max()

    return TimeWindow(from_timestamp(start_time.timestamp()), from_timestamp(end_time.timestamp()))

split_frame

split_frame(
    obj: Union[Series, DataFrame],
    n: Optional[Union[int, float, datetime]] = 0.5,
) -> Union[
    Tuple[Series, Series], Tuple[DataFrame, DataFrame]
]

Split a dataframe or series into two parts.

Parameters:

Name Type Description Default
obj Union[Series, DataFrame]

The object to split

required
n Optional[Union[int, float, datetime]]

The index to split at. If float, it is treated as a fraction of the length of the object. If int, it is treated as an index. If datetime, it is treated as a timestamp.

0.5

Returns:

Type Description
Union[Tuple[Series, Series], Tuple[DataFrame, DataFrame]]

A tuple of two objects, the first part and the second part.

Source code in vayu/pandas_utils.py
def split_frame(
    obj: Union["pd.Series", "pd.DataFrame"], n: Optional[Union[int, float, datetime]] = 0.5
) -> Union[Tuple["pd.Series", "pd.Series"], Tuple["pd.DataFrame", "pd.DataFrame"]]:
    """Split a dataframe or series into two parts.

    Args:
        obj: The object to split
        n: The index to split at. If float, it is treated as a fraction of the length of the object.
            If int, it is treated as a positional index.
            Any other value (e.g. a datetime) is treated as an index label; the
            row(s) carrying that label go to the second part, mirroring the
            positional case where position ``n`` starts the second part.
            If None, both parts are the whole object.

    Returns:
        A tuple of two objects, the first part and the second part.

    Raises:
        ValueError: If ``n`` is a float whose absolute value exceeds 1.
    """
    _validate_libraries()
    if isinstance(n, float):
        # Explicit raise rather than assert so the check survives `python -O`.
        if not abs(n) <= 1:
            raise ValueError("Float split index should be less than equal to 1")
        n = math.floor(len(obj) * n)
    if isinstance(n, int):
        return obj.iloc[:n], obj.iloc[n:]
    if n is None:
        # Open-ended slices on both sides: kept as-is for backward compatibility.
        return obj.loc[:n], obj.loc[n:]

    # Label slices are inclusive at both ends, so `loc[:n]` and `loc[n:]` would
    # both contain the boundary row. Take the tail by label and everything
    # before it by position so the two parts never overlap.
    tail = obj.loc[n:]
    head = obj.iloc[: len(obj) - len(tail)]
    return head, tail

select_frame

select_frame(frame: DataFrame, **conditions)

Select rows from a dataframe based on conditions.

Source code in vayu/pandas_utils.py
def select_frame(frame: "pd.DataFrame", **conditions):
    """Select rows from a dataframe based on conditions.

    Each keyword is either ``column`` (operator defaults to ``eq``) or
    ``column__op``. Only the last ``__`` separates the operator, so column
    names that themselves contain ``__`` can be used with an explicit operator.
    Conditions are applied one after another, each narrowing the previous result.

    Args:
        frame: The dataframe to filter.
        **conditions: Mapping of ``column`` / ``column__op`` to the value to
            compare against. When the value is an ``Interval`` the rows are
            selected with ``slice_frame`` on that column, and the operators
            ``ne`` / ``neq`` exclude the interval instead of including it.
            Other values are delegated to ``_selector`` together with the
            operator name.

    Returns:
        The filtered dataframe.
    """
    _validate_libraries()
    for spec, condition in conditions.items():
        # Split on the last "__" only; a plain two-way unpack of split("__")
        # raised for names such as "a__b__gt".
        column, sep, op = spec.rpartition("__")
        if not sep:
            column, op = spec, "eq"
        if isinstance(condition, Interval):
            frame = slice_frame(condition, frame, key=column, exclude=op in ("neq", "ne"))
        else:
            frame = frame[_selector(frame[column], op, condition)]
    return frame