§
    tÖ]e«2  ã            	       óZ  — d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlZddlmZ dd	lmZmZ d
dlmZ dZdZdZg d¢Z e¦   «         Z e¦   «         Z ee¦  «        D ]\  ZZedz   ee<   eeedz   <   Œ edegde gdgdgdgdgdœd¬¦  «        dddddddœd„¦   «         Z!dS )aD	  Collection of imbalanced datasets.

This collection of datasets has been proposed in [1]_. The
characteristics of the available datasets are presented in the table
below.

 ID    Name           Repository & Target           Ratio  #S       #F
 1     ecoli          UCI, target: imU              8.6:1  336      7
 2     optical_digits UCI, target: 8                9.1:1  5,620    64
 3     satimage       UCI, target: 4                9.3:1  6,435    36
 4     pen_digits     UCI, target: 5                9.4:1  10,992   16
 5     abalone        UCI, target: 7                9.7:1  4,177    10
 6     sick_euthyroid UCI, target: sick euthyroid   9.8:1  3,163    42
 7     spectrometer   UCI, target: >=44             11:1   531      93
 8     car_eval_34    UCI, target: good, v good     12:1   1,728    21
 9     isolet         UCI, target: A, B             12:1   7,797    617
 10    us_crime       UCI, target: >0.65            12:1   1,994    100
 11    yeast_ml8      LIBSVM, target: 8             13:1   2,417    103
 12    scene          LIBSVM, target: >one label    13:1   2,407    294
 13    libras_move    UCI, target: 1                14:1   360      90
 14    thyroid_sick   UCI, target: sick             15:1   3,772    52
 15    coil_2000      KDD, CoIL, target: minority   16:1   9,822    85
 16    arrhythmia     UCI, target: 06               17:1   452      278
 17    solar_flare_m0 UCI, target: M->0             19:1   1,389    32
 18    oil            UCI, target: minority         22:1   937      49
 19    car_eval_4     UCI, target: vgood            26:1   1,728    21
 20    wine_quality   UCI, wine, target: <=4        26:1   4,898    11
 21    letter_img     UCI, target: Z                26:1   20,000   16
 22    yeast_me2      UCI, target: ME2              28:1   1,484    8
 23    webpage        LIBSVM, w7a, target: minority 33:1   34,780   300
 24    ozone_level    UCI, ozone, data              34:1   2,536    72
 25    mammography    UCI, target: minority         42:1   11,183   6
 26    protein_homo   KDD CUP 2004, minority        111:1  145,751  74
 27    abalone_19     UCI, target: 19               130:1  4,177    10

References
----------
.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
   Imbalanced Data Learning and their Application in Bioinformatics."
   Dissertation, Georgia State University, (2011).
é    N)ÚOrderedDict)ÚBytesIO)Úmakedirs)ÚisfileÚjoin)Úurlopen)Úget_data_home)ÚBunchÚcheck_random_stateé   )Úvalidate_paramszGhttps://zenodo.org/record/61452/files/benchmark-imbalanced-learn.tar.gzÚxzdata.npz)ÚecoliÚoptical_digitsÚsatimageÚ
pen_digitsÚabaloneÚsick_euthyroidÚspectrometerÚcar_eval_34ÚisoletÚus_crimeÚ	yeast_ml8ÚsceneÚlibras_moveÚthyroid_sickÚ	coil_2000Ú
arrhythmiaÚsolar_flare_m0ÚoilÚ
car_eval_4Úwine_qualityÚ
letter_imgÚ	yeast_me2ÚwebpageÚozone_levelÚmammographyÚprotein_homoÚ
abalone_19é   ÚbooleanÚrandom_state)Ú	data_homeÚfilter_dataÚdownload_if_missingr,   ÚshuffleÚverboseT)Úprefer_skip_nested_validationFc           
      ód  — t          | ¬¦  «        } t          | d¦  «        }t          ¦   «         }|€t                               ¦   «         }nçt                               ¦   «         }	g }|D ]É}
t          |
t          ¦  «        r.|
|	vrt          |
› d|	› ¦  «        ‚|                     |
¦  «         ŒEt          |
t          ¦  «        rP|
dk     s|
dk    r#t          d|
› dt          dd	¦  «        › ¦  «        ‚|                     t          |
         ¦  «         Œªt          d
t          |
¦  «        › d¦  «        ‚|D ]z}
t          t          t          |
         ¦  «        z   t          z   }t          ||¦  «        }t          |¦  «        }|r‹|s‰t!          |d¬¦  «         |rt#          dt$          z  ¦  «         t'          t)          t$          ¦  «                             ¦   «         ¦  «        }t-          j        |¬¦  «        }|                     |¬¦  «         n|s|st3          d¦  «        ‚t5          j        |¦  «        }|d         |d         }}|rSt5          j        |j        d         ¦  «        }t=          |¦  «        }|                     |¦  «         ||         }||         }tA          |||
¬¦  «        ||
<   Œ||S )aÅ  Load the benchmark datasets from Zenodo, downloading it if necessary.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int, default=None
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : bool, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    verbose : bool, default=False
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The ordered is defined by ``filter_data``. Each Bunch object ---
        referred as dataset --- have the following attributes:

        dataset.data : ndarray of shape (n_samples, n_features)

        dataset.target : ndarray of shape (n_samples,)

        dataset.DESCR : str
            Description of the each dataset.

    Notes
    -----
    This collection of datasets have been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    )r-   ÚzenodoNz8 is not a dataset available. The available datasets are r*   é   zThe dataset with the ID=z* is not an available dataset. The IDs are é   z1The value in the tuple should be str or int. Got z	 instead.T)Úexist_okzDownloading %s)Úfileobj)Úpathz1Data not found and `download_if_missing` is FalseÚdataÚlabelr   )r:   ÚtargetÚDESCR)!r	   r   r   ÚMAP_NAME_IDÚkeysÚ
isinstanceÚstrÚ
ValueErrorÚappendÚintÚrangeÚMAP_ID_NAMEÚtypeÚPRE_FILENAMEÚPOST_FILENAMEr   r   ÚprintÚURLr   r   ÚreadÚtarfileÚopenÚ
extractallÚIOErrorÚnpÚloadÚarangeÚshaper   r0   r
   )r-   r.   r/   r,   r0   r1   Ú
zenodo_dirÚdatasetsÚfilter_data_Ú	list_dataÚitÚfilenameÚ	availableÚfÚtarr:   ÚXÚyÚindÚrngs                       ú9lib/python3.11/site-packages/imblearn/datasets/_zenodo.pyÚfetch_datasetsrc   d   sõ  € õN ¨	Ð2Ñ2Ô2€IÝi Ñ*Ô*€JÝ‰}Œ}€HàÐÝ"×'Ò'Ñ)Ô)ˆˆå×$Ò$Ñ&Ô&ˆ	ØˆØð 	ð 	ˆBÝ˜"cÑ"Ô"ð Ø˜YÐ&Ð&Ý$Øð Bð BØ6?ðBð Bñô ð ð
 !×'Ò'¨Ñ+Ô+Ð+Ð+Ý˜B¥Ñ$Ô$ð Ø˜’66˜R "šW˜WÝ$ð*°2ð *ð *å   B™<œ<ð*ð *ñô ð ð !×'Ò'­°B¬Ñ8Ô8Ð8Ð8å ð0Ý  ™HœHð0ð 0ð 0ñô ð ð ð 9ñ 9ˆÝ¥#¥k°"¤oÑ"6Ô"6Ñ6½ÑFˆÝ˜
 HÑ-Ô-ˆÝ˜8Ñ$Ô$ˆ	àð 	O yð 	OÝZ¨$Ð/Ñ/Ô/Ð/Øð .ÝÐ&­Ñ,Ñ-Ô-Ð-Ý¥™œ×)Ò)Ñ+Ô+Ñ,Ô,ˆAÝ”, qÐ)Ñ)Ô)ˆCØNŠN 
ˆNÑ+Ô+Ð+Ð+Ø$ð 	O¨Yð 	OÝÐMÑNÔNÐNåŒwxÑ Ô ˆØFŒ|˜T 'œ]ˆ1ˆàð 	Ý”)˜AœG AœJÑ'Ô'ˆCÝ$ \Ñ2Ô2ˆCØKŠK˜ÑÔÐØ#”ˆAØ#”ˆAå !¨A°RÐ8Ñ8Ô8ˆ‰‰à€Oó    )"Ú__doc__rM   Úcollectionsr   Úior   Úosr   Úos.pathr   r   Úurllib.requestr   ÚnumpyrQ   Úsklearn.datasetsr	   Úsklearn.utilsr
   r   Úutils._param_validationr   rK   rH   rI   ÚMAP_NAME_ID_KEYSr>   rF   Ú	enumerateÚvÚkrA   Útuplerc   © rd   rb   ú<module>ru      sÈ  ðð(ð (ðZ €€€Ø #Ð #Ð #Ð #Ð #Ð #Ø Ð Ð Ð Ð Ð Ø Ð Ð Ð Ð Ð Ø  Ð  Ð  Ð  Ð  Ð  Ð  Ð  Ø "Ð "Ð "Ð "Ð "Ð "à Ð Ð Ð Ø *Ð *Ð *Ð *Ð *Ð *Ø 3Ð 3Ð 3Ð 3Ð 3Ð 3Ð 3Ð 3à 5Ð 5Ð 5Ð 5Ð 5Ð 5àO€Ø€Ø€ðð ð Ð ð< ˆk‰mŒm€Øˆk‰mŒm€ØˆIÐ&Ñ'Ô'ð ð D€A€qØ˜‘U€KNØ€KA‘ÑÐð €à˜C[Ø˜e}Ø )˜{Ø'Ð(Ø;Ø;ðð ð #'ð
ñ 
ô 
ð ØØØØØð{ð {ð {ð {ñ
ô 
ð{ð {ð {rd   