o
    0GfT                     @   sN  d dl mZmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
Zd dlZer-d dlmZ ndd Zd d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ dZ e!e d Z"dd Z#dGddZ$dGddZ%dGddZ&dGddZ'dGddZ(dGddZ)e$ej*ej+ej,ej-e%ej.e)e&e(e'dZ/d d! Z0d"d# Z1dGd$d%Z2d&Z3d'Z4d(d) e4D Z5e3e6e5 Z7G d*d+ d+Z8ee8j9Z:e:;d,edd-d.g e:;d/g  e:;d0d1gd2gfd3gd4gfg ee<e:	dHd5d5d6d7e d8d9d:eej=ej>ej?f d;e	e< d<e@d=e@d>eAd?e@d@e	eeBeAf  dAe@dBej?fdCdDZCG dEdF dFZDdS )I    )PD_LT_2Appenderis_numeric_dtype)SP_LT_19)Union)SequenceNis_categorical_dtypec                 C   s   t | tjS N)
isinstancepdZCategoricalDtypedtype r   Blib/python3.10/site-packages/statsmodels/stats/descriptivestats.pyr	      s   r	   )stats)SimpleTable)jarque_bera)cache_readonly)	Docstring	Parameter)
array_like	bool_like
float_likeint_like)	      
      2   K   Z   _   c   g      Y@c                 C   s   |   |   S r
   )maxmindfr   r   r   pd_ptp"      r(   c                 C   s   dt |  j|dS )Nr   axis)npZisnansum)xr+   r   r   r   nancount&   s   r/   c                 C   s   t j| |dt j| |d S Nr*   )r,   nanmaxnanminZarrr+   r   r   r   nanptp*   s   r4   c                 C   s   t j| d |dS )N   r*   )r,   Znansumr3   r   r   r   nanuss.   s   r6   c                 C   s   t j| t|dS r0   )r,   nanpercentilePERCENTILESr3   r   r   r   r7   2   r)   r7   c                 C      t j| |ddS NZomit)r+   Z
nan_policy)r   kurtosisr3   r   r   r   nankurtosis6   r)   r<   c                 C   r9   r:   )r   skewr3   r   r   r   nanskewness:   r)   r>   )Zobsmeanstdr$   r%   Zptpvarr=   Zussr;   percentilesc                 C   ,   zt | }W |S  ty   tj}Y |S w )zi
    wrapper for scipy.stats.kurtosis that returns nan instead of raising Error

    missing options
    )r   r;   
ValueErrorr,   nanaresr   r   r   	_kurtosisM      rI   c                 C   rC   )ze
    wrapper for scipy.stats.skew that returns nan instead of raising Error

    missing options
    )r   r=   rD   r,   rE   rF   r   r   r   _skewZ   rJ   rK   c                 C   s   t | } t | |k}t | |k }|| d }ztt|||| dj}W ||fS  tyC   tt|||| d}Y ||fS w )a8  
    Signs test

    Parameters
    ----------
    samp : array_like
        1d array. The sample for which you want to perform the sign test.
    mu0 : float
        See Notes for the definition of the sign test. mu0 is 0 by
        default, but it is common to set it to the median.

    Returns
    -------
    M
    p-value

    Notes
    -----
    The signs test returns

    M = (N(+) - N(-))/2

    where N(+) is the number of values above `mu0`, N(-) is the number of
    values below.  Values equal to `mu0` are discarded.

    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. The test-statistic
    is distributed Binom(min(N(+), N(-)), n_trials, .5) where n_trials
    equals N(+) + N(-).

    See Also
    --------
    scipy.stats.wilcoxon
    g       @      ?)	r,   asarrayr-   r   Z	binomtestr%   ZpvalueAttributeErrorZ
binom_test)ZsampZmu0posnegMpr   r   r   	sign_testg   s   
#rS   )nobsmissingr?   std_errcir@   iqr
iqr_normalmad
mad_normalcoef_varranger$   r%   r=   r;   r   modemedianrB   )rT   rU   distincttopfreqc                 C      g | ]}|t vr|qS r   )NUMERIC_STATISTICS.0statr   r   r   
<listcomp>   s    rh   c                   @   s   e Zd ZdZg dZeZeZe	Z
	d!ddddeddd	eejejejf d
ee dededededeeeef  defddZdejdejfddZedejfddZedejfddZedejfddZdefddZdefdd ZdS )"Descriptiona  
    Extended descriptive statistics for data

    Parameters
    ----------
    data : array_like
        Data to describe. Must be convertible to a pandas DataFrame.
    stats : Sequence[str], optional
        Statistics to include. If not provided the full set of statistics is
        computed. This list may evolve across versions to reflect best
        practices. Supported options are:
        "nobs", "missing", "mean", "std_err", "ci", "ci", "std", "iqr",
        "iqr_normal", "mad", "mad_normal", "coef_var", "range", "max",
        "min", "skew", "kurtosis", "jarque_bera", "mode", "freq",
        "median", "percentiles", "distinct", "top", and "freq". See Notes for
        details.
    numeric : bool, default True
        Whether to include numeric columns in the descriptive statistics.
    categorical : bool, default True
        Whether to include categorical columns in the descriptive statistics.
    alpha : float, default 0.05
        A number between 0 and 1 representing the size used to compute the
        confidence interval, which has coverage 1 - alpha.
    use_t : bool, default False
        Use the Student's t distribution to construct confidence intervals.
    percentiles : sequence[float]
        A distinct sequence of floating point values all between 0 and 100.
        The default percentiles are 1, 5, 10, 25, 50, 75, 90, 95, 99.
    ntop : int, default 5
        The number of top categorical labels to report. Default is

    Attributes
    ----------
    numeric_statistics
        The list of supported statistics for numeric data
    categorical_statistics
        The list of supported statistics for categorical data
    default_statistics
        The default list of statistics

    See Also
    --------
    pandas.DataFrame.describe
        Basic descriptive statistics
    describe
        A simplified version that returns a DataFrame

    Notes
    -----
    The selectable statistics include:

    * "nobs" - Number of observations
    * "missing" - Number of missing observations
    * "mean" - Mean
    * "std_err" - Standard Error of the mean assuming no correlation
    * "ci" - Confidence interval with coverage (1 - alpha) using the normal or
      t. This option creates two entries in any tables: lower_ci and upper_ci.
    * "std" - Standard Deviation
    * "iqr" - Interquartile range
    * "iqr_normal" - Interquartile range relative to a Normal
    * "mad" - Mean absolute deviation
    * "mad_normal" - Mean absolute deviation relative to a Normal
    * "coef_var" - Coefficient of variation
    * "range" - Range between the maximum and the minimum
    * "max" - The maximum
    * "min" - The minimum
    * "skew" - The skewness defined as the standardized 3rd central moment
    * "kurtosis" - The kurtosis defined as the standardized 4th central moment
    * "jarque_bera" - The Jarque-Bera test statistic for normality based on
      the skewness and kurtosis. This option creates two entries, jarque_bera
      and jarque_beta_pval.
    * "mode" - The mode of the data. This option creates two entries in all tables,
      mode and mode_freq which is the empirical frequency of the modal value.
    * "median" - The median of the data.
    * "percentiles" - The percentiles. Values included depend on the input value of
      ``percentiles``.
    * "distinct" - The number of distinct categories in a categorical.
    * "top" - The mode common categories. Labeled top_n for n in 1, 2, ..., ``ntop``.
    * "freq" - The frequency of the common categories. Labeled freq_n for n in 1,
      2, ..., ``ntop``.
    rT   rU   r`   NT皙?Fr   numericcategoricalalphause_trB   ntopdatar   rm   rn   ro   rp   rB   rq   c             	   C   s  |}	t |tjtjfst|ddd}	|	jdkrt|}t|d}t|d}g }
d}|r4|
tj	 d}|rI|
d ||dkrBd	nd7 }|d7 }|sQ|sQt
d
t||
| _| jjd dkrjt
d| ddd | jjD | _dd | jjD | _|d urdd |D }|rt
d| d|d u rttnt|| _t|d| _d| jv | _d| jv | _| jr| jd  krt| jk rt
d ddgddgddgdd td| jd D dd td| jd D d }|D ]#}|| jv r| j|}| jd | ||  | j|d d   | _qt|d!dd"d#| _t| j| _t| jjd | jjd kr7t
d$t| jd%ksIt| jdkrMt
d&t |d'| _!d|  k rbdk sgt
d( t
d(t|d)| _"d S )*Nrr   r5   )maxdimr   rm   rn    categoryzand z4At least one of numeric and categorical must be Truer   z
Selecting z results in an empty DataFramec                 S      g | ]}t |qS r   )r   rf   Zdtr   r   r   rh   1      z(Description.__init__.<locals>.<listcomp>c                 S   rv   r   r   rw   r   r   r   rh   2  s    c                 S   rc   r   )DEFAULT_STATISTICSre   r   r   r   rh   7  s    z, z are not known statisticsrq   ra   rb   z"top must be a non-negative integerr^   	mode_frequpper_cilower_cir   jarque_bera_pvalc                 S      g | ]}d | qS Ztop_r   rf   ir   r   r   rh   J      c                 S   r~   Zfreq_r   r   r   r   r   rh   K  r   )r^   rW   r   ra   rb   rB   d)rs   r   zpercentiles must be distinctd   z.percentiles must be strictly between 0 and 100ro   z&alpha must be strictly between 0 and 1rp   )#r   r   Series	DataFramer   ndimr   appendr,   ZnumberrD   Zselect_dtypes_datashapeZdtypes_is_numeric_is_cat_likejoinlistry   _statsr   _ntop_compute_top_compute_freqr-   r]   index_percentilessortuniqueanyr   _alpha_use_t)selfrr   r   rm   rn   ro   rp   rB   rq   Zdata_arrZincludeZ	col_typesZundefZreplacementskeyidxr   r   r   __init__  s   





  $zDescription.__init__r'   returnc                    s    j  fdd| jD  S )Nc                    s   g | ]	}| j v r|qS r   r   )rf   sr&   r   r   rh   e  s    z(Description._reorder.<locals>.<listcomp>)locr   )r   r'   r   r&   r   _reorderd  s   zDescription._reorderc                 C   sT   | j }| j}|jd dkr|S |jd dkr|S tj||gdd}| || jj S )z
        Descriptive statistics for both numeric and categorical data

        Returns
        -------
        DataFrame
            The statistics
        r   r   r*   )rm   rn   r   r   concatr   r   columns)r   rm   rn   r'   r   r   r   frameg  s   
zDescription.framec           "   	      s  j jddjf }|j}|j\}}| }| }| }||   }|	 }	|	j|dk  |j|dk d   < j
rPt|d djd  }
ntjdjd  }
dd }||j}|jdkrt|tjrtj|d td	}tj|d tjd	}n/g }g }|jD ]}|j| }||d  ||d  qt|}t|}ntd }}|dk}t|jd tj}|| |j|  ||< |}z*dd
l m!} |	 }|D ]}||| j"r|| # $ r|| %tj||< qW n
 t&y   Y nw |jd dkr|'d|'d }n|}dd  |j fddddj}|	 }tj|j|dk< || }i dtj(tj)|tjd	|jd  |dd|jd | d|d|	d||
|	  d||
|	  d|d|d|d|dt*|d|+ d |, d!|d d"|d# d$|t-tjddg d%|t.dtj/  |d |d tj(||dtj(||d|0 d&}fd'd(|1 D }tjt2|3 |t2|4 d)}d*j5vr|S |jd dkr|'j6d+ 7t}n
tjj6d+ td,}t8t9d+|j d+|j krd-d. |jD |_nOd/}d+}|j} |rA|d09 }t9||j }t8t-|dkr>d1}|s%t9||  |d+  } d2t:t;|d+ d  d3}!d4|! d5fd6d.| D |_j5|j<  _5=tj>||gdd7S )8z
        Descriptive statistics for numeric data

        Returns
        -------
        DataFrame
            The statistics of the numeric columns
        Nr   rL   r   g      ?r5   c                 S   s   t | jtjr
| jn| jj}|  j|d}tri nddi}tj|fi |}t	|d r8t
|d |d fS |d jd dkrHdd |D S tjtjfS )Nr   ZkeepdimsTr   r   c                 S   rv   r   )floatrf   valr   r   r   rh     rx   z6Description.numeric.<locals>._mode.<locals>.<listcomp>)r   r   r,   Znumpy_dtypedropnaZto_numpyr   r   r^   Zisscalarr   r   rE   )Zserr   Zser_no_missingkwargsZmode_resr   r   r   _mode  s   z"Description.numeric.<locals>._moder   )is_extension_array_dtypeg      ?g      ?c                 S   s,   t | }|jd dk rt jfd S t|S )Nr   r5      )r,   rM   r   rE   r   )crG   r   r   r   _safe_jarque_bera  s   
z.Description.numeric.<locals>._safe_jarque_berac                    s   t  |  S r
   )r   r   )r.   )r   r   r   <lambda>      z%Description.numeric.<locals>.<lambda>expand)Zresult_typerT   r   rU   r?   rV   r{   r|   r@   rX   rZ   r\   r]   r$   r%   r=   r;      rY   r[   )r   r}   r^   rz   r_   c                        i | ]\}}| j v r||qS r   r   rf   kvr   r   r   
<dictcomp>       z'Description.numeric.<locals>.<dictcomp>)r   r   rB   r   )r   r   c                 S   s   g | ]}t d |  dqS )r   %)intrf   r   r   r   r   rh         z'Description.numeric.<locals>.<listcomp>Tr   Fz0.fz{0:z}%c                    s   g | ]}  |qS r   )formatr   )outputr   r   rh     r   r*   )?r   r   r   r   r   r@   countr?   abscopyr   r   tZppfr   ZnormZapplyTsizer   r   r   r,   rM   r   int64r   r   Z
atleast_1demptyZfullrE   Zpandas.api.typesr   r   isnullr   fillnaImportErrorZquantiler   onesr(   r$   r%   ZdiffZsqrtZpir_   itemsr   valueskeysr   r   astypeallZfloorlenstrtolistr   r   )"r   r'   cols_r   r@   r   r?   rZ   rV   qr   Zmode_valuesr^   Zmode_countsr   r   r   rz   Z_dfr   colrX   ZjbZnan_meanr\   resultsfinal
results_dfpercZdupeZscaler   Zfmtr   )r   r   r   r   rm   z  s   

$ 



	
"zDescription.numericc                    s  j jdddd jD f   jd } j} fdd D tjfddD tjd}i }i }D ]R}| }|jd	 j	kr[|j
dj	 ||< t|jdd
 ||< q6t|j
}|dgj	t|  7 }|||< t|}	|	tjgj	t|	  7 }	t|	||< q6dd tdj	d D }
tj|d|
|d}dd tdj	d D }
tj|d|
|d}tjtj|tjd jd	  |d jd	    |d}fdd| D }tjt| |t| dd}jrtj||gd	d}jrtj||gd	d}|S )z
        Descriptive statistics for categorical data

        Returns
        -------
        DataFrame
            The statistics of the categorical columns
        Nc                 S      g | ]}|qS r   r   rf   r   r   r   r   rh     r   z+Description.categorical.<locals>.<listcomp>r   c                    s   i | ]}| | j d dqS )T)	normalize)Zvalue_countsr   r&   r   r   r     r   z+Description.categorical.<locals>.<dictcomp>c                    s   i | ]
}| | j d  qS r   )r   r   )vcr   r   r     s    r   r   r   c                 S   r~   r   r   r   r   r   r   rh   -  r   object)r   r   r   c                 S   r~   r   r   r   r   r   r   rh   /  r   r   rj   c                    r   r   r   r   r   r   r   r   9  r   )r   r   r   r*   )r   r   r   r   r   r   r   r,   r   r   r   rM   Zilocr   r   rE   r]   r   r   r   r   r   r   r   r   r   r   )r   r   r   r`   ra   rb   r   Zsingler   Zfreq_valr   Ztop_dfZfreq_dfr   r   r   r   )r'   r   r   r   rn     sT    




zDescription.categoricalc              	   C   s   | j t}|   r|d}dd |jD }dd |jD }g }| D ]\}}|	dd |D  q)dd }t
|||dd	d
|didgt| dS )z
        Summary table of the descriptive statistics

        Returns
        -------
        SimpleTable
            A table instance supporting export to text, csv and LaTeX
        rt   c                 S   rv   r   r   r   r   r   r   rh   S  rx   z'Description.summary.<locals>.<listcomp>c                 S   rv   r   r   r   r   r   r   rh   T  rx   c                 S   r   r   r   )rf   r   r   r   r   rh   W  r   c                 S   s.   t | tr| S | d | krtt| S | dS )Nr   z0.4g)r   r   r   )r   r   r   r   
_formatterY  s
   
z'Description.summary.<locals>._formatterzDescriptive StatisticsZ	data_fmtsz%s)r   r   r   )headerstubstitleZtxt_fmtZ	datatypes)r   r   r   r   r   r   r   r   Ziterrowsr   r   r   )r   r'   r   r   rr   r   rowr   r   r   r   summaryG  s"   	
zDescription.summaryc                 C   s   t |   S r
   )r   r   Zas_textr   r   r   r   __str__i  r)   zDescription.__str__r
   ) __name__
__module____qualname____doc__Z_int_fmtrd   Znumeric_statisticsCATEGORICAL_STATISTICSZcategorical_statisticsry   Zdefault_statisticsr8   r   r,   ndarrayr   r   r   r   r   boolr   r   r   r   r   r   rm   rn   r   r   r   r   r   r   r   ri      sR    R	

Y 9"ri   ZReturnsr   zDescriptive statisticsZ
AttributeszSee Also)zpandas.DataFrame.describeNzBasic descriptive statistics)ri   Nz;Descriptive statistics class with additional output optionsTrk   Fr   rl   rr   r   rm   rn   ro   rp   rB   rq   r   c             
   C   s   t | |||||||djS )Nrl   )ri   r   )rr   r   rm   rn   ro   rp   rB   rq   r   r   r   describe  s   	r   c                   @   s   e Zd ZdZdd ZdS )Describez
    Removed.
    c                 C   s   t d)NzDescribe has been removed)NotImplementedError)r   Zdatasetr   r   r   r     s   zDescribe.__init__N)r   r   r   r   r   r   r   r   r   r     s    r   r   r
   )EZstatsmodels.compat.pandasr   r   r   Zstatsmodels.compat.scipyr   typingr   Zcollections.abcr   Znumpyr,   Zpandasr   Zpandas.core.dtypes.commonr	   Zscipyr   Zstatsmodels.iolib.tabler   Zstatsmodels.stats.stattoolsr   Zstatsmodels.tools.decoratorsr   Zstatsmodels.tools.docstringr   r   Zstatsmodels.tools.validationr   r   r   r   r8   ZarrayZ	QUANTILESr(   r/   r4   r6   r7   r<   r>   ZnanmeanZnanstdr1   r2   ZnanvarMISSINGrI   rK   rS   rd   r   Z_additionaltuplery   ri   r   ZdsZreplace_blockr   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s    






/   
=
	
