You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
5.8 KiB
133 lines
5.8 KiB
1 year ago
|
"""
|
||
|
This type stub file was generated by pyright.
|
||
|
"""
|
||
|
|
||
|
"""Statistical transformations for visualization.
|
||
|
|
||
|
This module is currently private, but is being written to eventually form part
|
||
|
of the public API.
|
||
|
|
||
|
The classes should behave roughly in the style of scikit-learn.
|
||
|
|
||
|
- All data-independent parameters should be passed to the class constructor.
|
||
|
- Each class should implement a default transformation that is exposed through
|
||
|
__call__. These are currently written for vector arguments, but I think
|
||
|
consuming a whole `plot_data` DataFrame and returning it with transformed
|
||
|
variables would make more sense.
|
||
|
- Some classes have data-dependent preprocessing that should be cached and used
|
||
|
multiple times (think defining histogram bins off all data and then counting
|
||
|
observations within each bin multiple times per data subsets). These currently
|
||
|
have unique names, but it would be good to have a common name. Not quite
|
||
|
`fit`, but something similar.
|
||
|
- Alternatively, the transform interface could take some information about grouping
|
||
|
variables and do a groupby internally.
|
||
|
- Some classes should define alternate transforms that might make the most sense
|
||
|
with a different function. For example, KDE usually evaluates the distribution
|
||
|
on a regular grid, but it would be useful for it to transform at the actual
|
||
|
datapoints. Then again, this could be controlled by a parameter at the time of
|
||
|
class instantiation.
|
||
|
|
||
|
"""
|
||
|
class KDE:
    """Univariate and bivariate kernel density estimator."""

    def __init__(self, *, bw_method=..., bw_adjust=..., gridsize=..., cut=..., clip=..., cumulative=...) -> None:
        """Initialize the estimator with its parameters.

        All parameters are keyword-only.

        Parameters
        ----------
        bw_method : string, scalar, or callable, optional
            Method for determining the smoothing bandwidth to use; passed to
            :class:`scipy.stats.gaussian_kde`.
        bw_adjust : number, optional
            Factor that multiplicatively scales the value chosen using
            ``bw_method``. Increasing will make the curve smoother. See Notes.
        gridsize : int, optional
            Number of points on each dimension of the evaluation grid.
        cut : number, optional
            Factor, multiplied by the smoothing bandwidth, that determines how
            far the evaluation grid extends past the extreme datapoints. When
            set to 0, truncate the curve at the data limits.
        clip : pair of numbers or None, or a pair of such pairs
            Do not evaluate the density outside of these limits.
        cumulative : bool, optional
            If True, estimate a cumulative distribution function.

        """
        ...

    def define_support(self, x1, x2=..., weights=..., cache=...):  # -> NDArray[floating[Any]] | tuple[NDArray[floating[Any]], NDArray[floating[Any]]]:
        """Create the evaluation grid for a given data set."""
        ...

    def __call__(self, x1, x2=..., weights=...):  # -> tuple[NDArray[Unknown] | Unknown, NDArray[floating[Any]] | tuple[NDArray[floating[Any]], NDArray[floating[Any]]]] | tuple[NDArray[float64] | Unknown, NDArray[floating[Any]] | tuple[NDArray[floating[Any]], NDArray[floating[Any]]]]:
        """Fit and evaluate on univariate or bivariate data."""
        ...
|
||
|
|
||
|
|
||
|
|
||
|
class Histogram:
    """Univariate and bivariate histogram estimator."""

    def __init__(self, stat=..., bins=..., binwidth=..., binrange=..., discrete=..., cumulative=...) -> None:
        """Initialize the estimator with its parameters.

        Parameters
        ----------
        stat : {"count", "frequency", "density", "probability"}
            Aggregate statistic to compute in each bin.

            - ``count`` shows the number of observations
            - ``frequency`` shows the number of observations divided by the bin width
            - ``density`` normalizes counts so that the area of the histogram is 1
            - ``probability`` normalizes counts so that the sum of the bar heights is 1

        bins : str, number, vector, or a pair of such values
            Generic bin parameter that can be the name of a reference rule,
            the number of bins, or the breaks of the bins.
            Passed to :func:`numpy.histogram_bin_edges`.
        binwidth : number or pair of numbers
            Width of each bin, overrides ``bins`` but can be used with
            ``binrange``.
        binrange : pair of numbers or a pair of pairs
            Lowest and highest value for bin edges; can be used either
            with ``bins`` or ``binwidth``. Defaults to data extremes.
        discrete : bool or pair of bools
            If True, set ``binwidth`` and ``binrange`` such that bin
            edges cover integer values in the dataset.
        cumulative : bool
            If True, return the cumulative statistic.

        """
        ...

    def define_bin_edges(self, x1, x2=..., weights=..., cache=...):  # -> NDArray[Any] | tuple[Unknown, ...]:
        """Given data, return the edges of the histogram bins."""
        ...

    def __call__(self, x1, x2=..., weights=...):  # -> tuple[Any | ndarray[Any, Any] | NDArray[float64] | NDArray[Any], Unknown | NDArray[Any] | tuple[Unknown, ...]] | tuple[Unknown, Unknown | NDArray[Any] | tuple[Unknown, ...]]:
        """Count the occurrences in each bin, maybe normalize."""
        ...
|
||
|
|
||
|
|
||
|
|
||
|
class ECDF:
    """Univariate empirical cumulative distribution estimator."""

    def __init__(self, stat=..., complementary=...) -> None:
        """Initialize the class with its parameters.

        Parameters
        ----------
        stat : {"proportion", "count"}
            Distribution statistic to compute.
        complementary : bool
            If True, use the complementary CDF (1 - CDF).

        """
        ...

    def __call__(self, x1, x2=..., weights=...):  # -> tuple[Any, Any]:
        """Return proportion or count of observations below each sorted datapoint."""
        ...
|
||
|
|
||
|
|
||
|
|