Filter

filter_nulls

filter_nulls(self: DataFrame, *columns: ColumnReference, strict: bool = False, invert: bool = False) -> DataFrame

Keep all observations that are null across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations that are null across any/all column(s). |

Source code in src/tidy_tools/core/filter.py
def filter_nulls(
    self: DataFrame,
    *columns: ColumnReference,
    strict: bool = False,
    invert: bool = False,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that represent null across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?

    Returns
    -------
    DataFrame
        Observations that represent null across any/all column(s).
    """
    query = construct_query(
        *columns or self.columns,
        predicate=_predicate.is_null,
        strict=strict,
        invert=invert,
    )
    return self.filter(query)
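
A minimal usage sketch follows. The local SparkSession, the toy DataFrame, and the plain function-call style are assumptions for illustration; depending on how tidy-tools registers these helpers, they may also be available as bound DataFrame methods.

```python
from pyspark.sql import SparkSession
from tidy_tools.core.filter import filter_nulls

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("b", None), (None, None)],
    schema="name string, value int",
)

# Default (strict=False): keep rows where ANY listed column is null.
filter_nulls(df, "name", "value").show()

# strict=True: keep only rows where ALL listed columns are null.
filter_nulls(df, "name", "value", strict=True).show()

# invert=True removes the matching rows, keeping fully populated ones.
filter_nulls(df, invert=True).show()
```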

filter_substring

filter_substring(self: DataFrame, *columns: ColumnReference, substring: str, strict: bool = False, invert: bool = False) -> DataFrame

Keep all observations that contain the substring across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `substring` | `str` | Substring to search for in each column. | required |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations that contain the substring across any/all column(s). |

Source code in src/tidy_tools/core/filter.py
def filter_substring(
    self: DataFrame,
    *columns: ColumnReference,
    substring: str,
    strict: bool = False,
    invert: bool = False,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that match the regular expression across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    substring : str
        String expression to check.
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?

    Returns
    -------
    DataFrame
        Observations that match the substring across any/all column(s).
    """
    query = construct_query(
        *columns or self.columns,
        predicate=_predicate.is_substring,
        substring=substring,
        strict=strict,
        invert=invert,
    )
    return self.filter(query)
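
A brief usage sketch, under the same assumptions as the filter_nulls example; note that `substring` is keyword-only:

```python
from pyspark.sql import SparkSession
from tidy_tools.core.filter import filter_substring

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Anna",), ("Bob",), ("Hannah",)], schema="name string")

# Keep rows whose `name` contains the literal substring "nn".
filter_substring(df, "name", substring="nn").show()

# invert=True drops those rows instead.
filter_substring(df, "name", substring="nn", invert=True).show()
```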

filter_regex

filter_regex(self: DataFrame, *columns: ColumnReference, pattern: str, strict: bool = False, invert: bool = False) -> DataFrame

Keep all observations that match the regular expression across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `pattern` | `str` | Regular expression. Must compile under Python's `re` library. | required |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations that match the regular expression across any/all column(s). |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If `pattern` cannot be compiled as a regular expression. |

Source code in src/tidy_tools/core/filter.py
def filter_regex(
    self: DataFrame,
    *columns: ColumnReference,
    pattern: str,
    strict: bool = False,
    invert: bool = False,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that match the regular expression across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    pattern : str
        Regular expression. Must be compiled according to `re` library.
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?

    Returns
    -------
    DataFrame
        Observations that match the regular expression across any/all column(s).
    """
    try:
        re.compile(pattern)
    except re.error as exc:
        # Fail fast: an invalid pattern would otherwise produce a broken query.
        raise ValueError(
            f"Cannot compile {pattern=} as a regular expression."
        ) from exc
    query = construct_query(
        *columns or self.columns,
        predicate=_predicate.is_regex_match,
        pattern=pattern,
        strict=strict,
        invert=invert,
    )
    return self.filter(query)
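
A usage sketch under the same assumptions; the pattern must compile under Python's stdlib `re` module:

```python
from pyspark.sql import SparkSession
from tidy_tools.core.filter import filter_regex

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("A12",), ("xyz",), ("B7",)], schema="code string")

# Keep rows where `code` matches "one uppercase letter, then digits".
filter_regex(df, "code", pattern=r"^[A-Z]\d+$").show()
```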

filter_elements

filter_elements(self: DataFrame, *columns: ColumnReference, elements: Sequence, strict: bool = False, invert: bool = False) -> DataFrame

Keep all observations whose values exist within `elements` across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `elements` | `Sequence` | Collection of items expected to exist in any/all column(s). | required |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations whose values exist within `elements` across any/all column(s). |

Source code in src/tidy_tools/core/filter.py
def filter_elements(
    self: DataFrame,
    *columns: ColumnReference,
    elements: Sequence,
    strict: bool = False,
    invert: bool = False,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that exist within elements across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    elements : Sequence
        Collection of items expected to exist in any/all column(s).
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?

    Returns
    -------
    DataFrame
        Observations that exist within range across any/all column(s).
    """
    query = construct_query(
        *columns or self.columns,
        predicate=_predicate.is_member,
        elements=elements,
        strict=strict,
        invert=invert,
    )
    return self.filter(query)
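
A usage sketch under the same assumptions as the earlier examples:

```python
from pyspark.sql import SparkSession
from tidy_tools.core.filter import filter_elements

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("NY",), ("CA",), ("TX",)], schema="state string")

# Keep rows whose `state` is a member of the allowed collection.
filter_elements(df, "state", elements=["NY", "CA"]).show()

# invert=True excludes the listed values instead.
filter_elements(df, "state", elements=["NY", "CA"], invert=True).show()
```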

filter_range

filter_range(self: DataFrame, *columns: ColumnReference, boundaries: Sequence[Any], strict: bool = False, invert: bool = False) -> DataFrame

Keep all observations that exist within the range across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `boundaries` | `Sequence[Any]` | Bounds of the range. Must be of the same type and in ascending order. | required |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations that exist within the range across any/all column(s). |

Raises:

| Type | Description |
| --- | --- |
| `AssertionError` | Raised if either condition is not met: `lower_bound` is not the same type as `upper_bound`, or `lower_bound` is greater than or equal to `upper_bound`. |

Source code in src/tidy_tools/core/filter.py
def filter_range(
    self: DataFrame,
    *columns: ColumnReference,
    boundaries: Sequence[Any],
    strict: bool = False,
    invert: bool = False,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that exist within range across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    boundaries : Sequence[Any]
        Bounds of range. Must be of same type and in ascending order.
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?

    Returns
    -------
    DataFrame
        Observations that exist within range across any/all column(s).

    Raises
    ------
    AssertionError
        Raises error if either condition is not met:
            - `lower_bound` is not same type as `upper_bound`
            - `lower_bound` is greater than or equal to `upper_bound`.
    """
    lower_bound, upper_bound = boundaries
    # Validate explicitly rather than via `assert`, which is stripped under -O.
    if type(lower_bound) is not type(upper_bound) or lower_bound >= upper_bound:
        raise AssertionError(
            "Boundaries must be the same type and in ascending order. "
            f"Received ({lower_bound=} ({type(lower_bound)}), "
            f"{upper_bound=} ({type(upper_bound)}))"
        )
    query = construct_query(
        *columns or self.columns,
        predicate=_predicate.is_between,
        boundaries=boundaries,
        strict=strict,
        invert=invert,
    )
    return self.filter(query)
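
A usage sketch; whether the bounds are inclusive depends on the underlying `_predicate.is_between` implementation (PySpark's `Column.between` is inclusive), which is an assumption here:

```python
from pyspark.sql import SparkSession
from tidy_tools.core.filter import filter_range

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(5,), (15,), (25,)], schema="age int")

# Keep rows where `age` falls within the bounds; the bounds must share a
# type and be in ascending order, or an AssertionError is raised.
filter_range(df, "age", boundaries=(10, 20)).show()

# Mismatched or descending bounds fail fast:
# filter_range(df, "age", boundaries=(20, 10))  # AssertionError
```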

filter_custom

filter_custom(self: DataFrame, *columns: ColumnReference, predicate: Callable, strict: bool = False, invert: bool = False, **kwargs: dict) -> DataFrame

Keep all observations that satisfy the predicate across any/all column(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `self` | `DataFrame` | Object inheriting from PySpark DataFrame. | required |
| `*columns` | `ColumnReference` | Arbitrary number of column references. All columns must exist in `self`. If none are passed, all columns are used in the filter. | `()` |
| `predicate` | `Callable` | Function returning a PySpark Column for the filtering expression. | required |
| `strict` | `bool` | Should the condition be true for all column(s)? | `False` |
| `invert` | `bool` | Should observations that meet the condition be kept (`False`) or removed (`True`)? | `False` |
| `**kwargs` | `dict` | Additional options to pass to `predicate`. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Observations that satisfy the predicate across any/all column(s). |

Source code in src/tidy_tools/core/filter.py
def filter_custom(
    self: DataFrame,
    *columns: ColumnReference,
    predicate: Callable,
    strict: bool = False,
    invert: bool = False,
    **kwargs: dict,
) -> DataFrame:  # numpydoc ignore=PR09
    """
    Keep all observations that match the regular expression across any/all column(s).

    Parameters
    ----------
    self : DataFrame
        Object inheriting from PySpark DataFrame.
    *columns : ColumnReference
        Arbitrary number of column references. All columns must exist in `self`. If none
        are passed, all columns are used in filter.
    predicate : Callable
        Function returning PySpark Column for filtering expression.
    strict : bool
        Should condition be true for all column(s)?
    invert : bool
        Should observations that meet condition be kept (False) or removed (True)?
    **kwargs : dict, optional
        Additional options to pass to `predicate`.

    Returns
    -------
    DataFrame
        Observations that match the substring across any/all column(s).
    """
    query = construct_query(
        *columns or self.columns,
        predicate=predicate,
        strict=strict,
        invert=invert,
        **kwargs,
    )
    return self.filter(query)
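
A sketch that assumes `construct_query` invokes `predicate(column, **kwargs)` for each column reference; `longer_than` and its `min_length` keyword are hypothetical names introduced only for this example:

```python
from pyspark.sql import Column, SparkSession
from pyspark.sql import functions as F
from tidy_tools.core.filter import filter_custom

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("apple",), ("fig",), ("banana",)], schema="fruit string")

# Hypothetical predicate: returns a boolean Column; extra keyword
# arguments arrive via **kwargs on filter_custom.
def longer_than(column: str, min_length: int) -> Column:
    return F.length(F.col(column)) > min_length

# Keep rows where `fruit` has more than four characters.
filter_custom(df, "fruit", predicate=longer_than, min_length=4).show()
```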