
    _nd                     n
   d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*Z*d dl+m,Z, dZ-dZ.e-e.z   Z/d Z0d Z1d Z2d Z3d Z4d  Z5ej6        7                    d!eef          d"             Z8d# Z9d$ Z:d% Z;d& Z<d' Z=d( Z>d) Z?d* Z@d+ ZAd, ZBd- ZCd. ZDd/ ZEd0 ZFd1 ZGd2 ZHd3 ZId4 ZJd5 ZKd6 ZLd7 ZMe#d8             ZNd9 ZOej6        7                    d!eef          d:             ZPd; ZQd< ZRd= ZSd> ZTe#d?             ZUej6        7                    d!eef          d@             ZVdA ZWdB ZXdC ZYe#dD             ZZdE Z[dF Z\ej6        7                    dGej]        ej^        ej_        g          dH             Z`dI ZadJ ZbdK ZcdL ZddM ZedN ZfdO ZgdP Zhe#dQ             ZidR ZjdS ZkdT Zlej6        7                    d!eeef          dU             Zmej6        7                    dVejn        ejo        g          dW             ZpdX Zqej6        7                    dYejr        ejo        dZfejs        ejo        dZfejn        ejn        d[fejo        ejo        d[fg          d\             Ztej6        7                    d] ed^_           ed^_           ed^_          g          d`             Zuda Zve#db             Zwe%dc             Zxe#ej6        7                    ddeeeg          de                         Zyej6        7                    ddeeeg          ej6        7                    dfdgezdhfdie{djfg          dk                         Z|ej6        7                    ddee ej}        ee#l          g          ej6        7                    dmdn do g          ej6        7                    dpdidgg          dq                                     Z~ej6        7                    ddeeeg          dr             Zej6        7                    d!eeeg          ej6        7                    dsdtdugdddvddwdxdydzf	dd{ ddvddwd|dydzf	dd} ddvd~ddd|df	ddd dvd~d ddydf	dddddd ddydf	dg          d                         Zej6        7                    deddddddgfee-ff          d             Zd Ze#d             Zej6        7                    ddeeeeg          d             ZdS )    )MappingN)sparse)
strip_tagsstrip_accents_unicodestrip_accents_ascii)HashingVectorizer)CountVectorizer)TfidfTransformer)TfidfVectorizer)ENGLISH_STOP_WORDS)train_test_split)cross_val_score)GridSearchCV)Pipeline)	LinearSVC)clone)assert_array_almost_equal)assert_array_equal)IS_PYPY)assert_almost_equalfails_if_pypyassert_allclose_dense_sparseskip_if_32bit)defaultdict)partial)StringIO)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 D    t          |                                           S N)r   upperss    Jlib/python3.11/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercaser%   >   s     ##))+++    c                 .    |                      dd          S )N   ée)replacer"   s    r$   strip_eacuter+   B   s    99T3r&   c                 *    |                                  S r    splitr"   s    r$   split_tokenizer/   F   s    7799r&   c                     dgS )Nthe_ultimate_feature r"   s    r$   lazy_analyzer3   J   s    "##r&   c                  d   d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d	} d
}t          |           |k    sJ d} d}t          |           |k    sJ d} d
}t          |           |k    sJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫr   aexpecteds     r$   test_strip_accentsrA   N   s    AH ##x////(A H ##x//// 	AH ##x//// 	AH ##x//// 	AH ##x//// 	#AH ##x//// 	AH ##x//////r&   c                      d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d} d}t          |           |k    sJ d S )	Nr5   r6   r7   r8   r9   r=   r:   r;   r   r>   s     r$   test_to_asciirC   r   s     AHq!!X----(A Hq!!X---- 	AHq!!X---- 	AHq!!X------r&   
Vectorizerc                     | d                                           }d}g d} ||          |k    sJ d}g d} ||          |k    sJ  | d                                           }t          d	          }g d
} ||          |k    sJ  | t                                                     }d}g d} ||          |k    sJ  | t          d                                           }d}g d} ||          |k    sJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestreallymetharry	yesterdayfile)input'This is a test with a file-like object!)rT   rU   rV   withr[   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
AIMANGEDU	KANGOUROUCEMIDIETAITPASTRESBON)	tokenizerrH   )
zj'airK   rL   rM   rN   zmidi,zc'etaitrQ   rR   zbon.)build_analyzerr   r%   r/   )rD   watextr@   s       r$   test_word_analyzer_unigramsrq      st   	'	*	*	*	9	9	;	;BGD  H 2d88x?DLLLH2d88x	&	!	!	!	0	0	2	2B=>>DGGGH2d88x 
	+	+	+	:	:	<	<BHD  H 2d88x 
nG	D	D	D	S	S	U	UBGD  H 2d88xr&   c                  |    t          ddd                                          } d}g d} | |          |k    sJ d S )Nwordunicode      analyzerrH   ngram_rangerI   )rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rn   )ro   rp   r@   s      r$   'test_word_analyzer_unigrams_and_bigramsr{      se    	yf
 
 
n  HD  H* 2d88xr&   c                     d} |                      d          }t          dd                                          }t          j        t
                    5   ||           d d d            n# 1 swxY w Y   t          ddd                                          }t          j        t
                    5   ||           d d d            d S # 1 swxY w Y   d S )	NrI   zutf-8ru   rF   )rz   encodingchar      )ry   rz   r}   )encoder   rn   pytestraisesUnicodeDecodeError)rp   
text_bytesro   cas       r$   test_unicode_decode_errorr      s]    HDW%%J 
Vg	>	>	>	M	M	O	OB	)	*	*  
:               
Vg
 
 
n  
)	*	*  
:                 s$   A,,A03A04CCCc                     t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ d
}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          ddd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nr~   rt   r   rx   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayr[   r\   ry   rz   r]   r   rn   r   cngarp   r@   s      r$   test_char_ngram_analyzerr      sS   yf  n 	 GD222H4::bqb>X%%%%AAAH4::bcc?h&&&&BD222H4::bqb>X%%%%AAAH4::bcc?h&&&&v6  n 	 =>>D222H4::bqb>X%%%%%%r&   c                  f   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          } t          d          }g d} | |          d d         |k    sJ d S )Nchar_wbrt   r   rx   r   )z thr   r   r   z thir   )r   r   r   r   zerday r   r[   r   zA test with a file-like object!)z a z tetesestzst z tesr   r   r   s      r$   test_char_wb_ngram_analyzerr     s    )  n 	 CD333H4::bqb>X%%%%AAAH4::bcc?h&&&&yf  n 	 566D:::H4::bqb>X%%%%%%r&   c                  `   t          ddd                                          } d}g d} | |          d d         |k    sJ g d} | |          d	d          |k    sJ t          d
dd                                          }t          |          } ||           | |          k    sJ d S )Nrs   rt   r   rx   r   )zthis is testzis test reallyztest really metr   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayr[   r   r   )r   rp   r@   	cnga_filer[   s        r$   test_word_ngram_analyzerr   $  s    yf  n 	 CDDDDH4::bqb>X%%%%  H
 4::bcc?h&&&&v6  n  D>>D9T??dd4jj((((((r&   c                     ddd} t          |                                           }t          t          t          t          t          t                    fD ]} ||           }t          |          }|	                    t                     t          |t                    r|j        | k    sJ nt          |j                  |k    sJ |                    t                    }|j        d         t!          |          k    sJ  ||           }t          |          }|                    |          }t!          |          |j        d         k    sJ d S )Nr   rv   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvs          r$   &test_countvectorizer_custom_vocabularyr   ;  sD   ##E

E dD'+s";";< & &CJJ!,,,   a!! 	2#u,,,,,t'((E1111NN>**wqzSZZ''''CJJ!,,,$$Q''3xx171:%%%%%& &r&   c                  D   ddg} t          dt          |           fdt                      fg          }|                    t                    }t          |j        d         j                  t          |           k    sJ |j        d         t          |           k    sJ d S )Nr   r   countr   tfidfrv   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )what_we_likepiper   s      r$   /test_countvectorizer_custom_vocabulary_pipeliner   P  s    V$Lo>>>?&(()	
 D 	=))At(455\9J9JJJJJ71:\********r&   c                      ddd} d}t          j        t          |          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   pasta_sizilianar   r   
ValueErrorr   r   )r   msgr   s      r$   7test_countvectorizer_custom_vocabulary_repeated_indicesr   ]  s    ##E
0C	z	-	-	- & &%000#$%%%& & & & & & & & & & & & & & & & & &s   'AAAc                      ddd} t          j        t          d          5  t          |           }|                    dg           d d d            d S # 1 swxY w Y   d S )Nrv   rw   r   zdoesn't contain indexr   r   pasta_verdurar   r   r   s     r$   0test_countvectorizer_custom_vocabulary_gap_indexr   e  s    ##E	z)@	A	A	A $ $%000/"###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   'AAAc                  z   t                      } |                     d           |                                 t          k    sJ |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   |                     d           t	          j        t                    5  |                                  d d d            n# 1 swxY w Y   g d}|                     |           |                                 t          |          k    sJ d S )Nenglish
stop_words_bad_str_stop__bad_unicode_stop_)someotherwords)r   
set_paramsget_stop_wordsr   r   r   r   r   )cvstoplists     r$   test_countvectorizer_stop_wordsr   l  s   			BMMYM'''"44444MM-M...	z	"	"  
              MM1M222	z	"	"  
              )))HMMXM&&&#h--//////s$   3BBBC//C36C3c                  p   t          j        t          d          5  t          g           } |                     dg           d d d            n# 1 swxY w Y   t          j        t          d          5  t          dd          }|                    g d           d d d            d S # 1 swxY w Y   d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   s     r$   %test_countvectorizer_empty_vocabularyr   {  sL   	z);	<	<	<  "---%               
z);	<	<	< E E39===	CCCDDDE E E E E E E E E E E E E E E E E Es#   'AAA5)B++B/2B/c                      t                      } |                     t          d d                   }|                     t          dd                    }|j        d         |j        d         k    sJ d S )Nr   rv   )r   r   r   r   )r   X1X2s      r$   test_fit_countvectorizer_twicer     sh    			B			-+	,	,B			-+	,	,B8A;"(1+%%%%%%r&   c                      g d} d}t          |          }|                    |            g d}|                                }t          ||           dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr@   feature_names_outs        r$   )test_countvectorizer_custom_token_patternr     sr    
  F ?M }===JV$$$,,,H"88::((33333r&   c                      g d} d}d}t          |          }t          j        t          |          5  |                    |            ddd           dS # 1 swxY w Y   dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   r   r   r   r   )r   r   err_msgr   s       r$   <test_countvectorizer_custom_token_pattern_with_several_groupr     s    
  F AM<G }===J	z	1	1	1  v                 s   AAAc                  z   g d} d}t          d|           }t          j        t          |          5  |                    |            d d d            n# 1 swxY w Y   t          j                    5  t          j        dt                     |                    |            d d d            d S # 1 swxY w Y   d S )N)SampleUpperCase
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   r   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   s      r$   'test_countvectorizer_uppercase_in_vocabr    sE    ;::J	)  !4JGGGJ	k	1	1	1 # #z"""# # # # # # # # # # # # # # # 
	 	"	" ) )g{333Z((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )s#   AAA30B00B47B4c                      g dg dg dg} t          dd                              |           }g d}|                    |          }t          ||           dS )	z0Check get_feature_names_out for TfidfTransformerrv   rv   rv   rv   rv   r   rv   r   r   Tl2
smooth_idfnorm)r?   cbN)r   r   r   r   )r   trfeature_names_inr   s       r$   %test_tf_transformer_feature_names_outr    sq    	IIIyyy)A	T	5	5	5	9	9!	<	<B&001ABB'):;;;;;r&   c                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ d S )Nr	  r
  r  Tr  r  r   rw   rv   axisr   r   r   )r   r   toarrayallr   sumr   r  r   s      r$   test_tf_idf_smoothingr    s    	IIIyyy)A	T	5	5	5BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	T	5	5	5BQ''))EQJr&   c                     g dg dg dg} t          dd          }|                    |                                           }|dk                                    sJ t	          |dz                      d	
          g d           g dg dg dg} t          dd          }d}t          j        t          |          5  |                    |                                            d d d            d S # 1 swxY w Y   d S )Nr	  r
  r  Fr  r  r   rw   rv   r  r  zdivide by zeror   )	r   r   r  r  r   r  r   r  RuntimeWarning)r   r  r   in_warning_messages       r$   test_tfidf_no_smoothingr!    s[   	IIIyyy)A	U	6	6	6BQ''))EQJ uaxnn!n44oooFFF 
IIIyyy)A	U	6	6	6B)	n,>	?	?	? & &
##%%%& & & & & & & & & & & & & & & & & &s    (C55C9<C9c                  ,   dgdgdgg} t          ddd           }|                    |                                           }|d         dk    sJ |d         |d         k    sJ |d         |d         k    sJ |d         dk     sJ |d         dk     sJ d S )Nrv   rw   r   TF)sublinear_tfuse_idfr  r   )r   r   r  r  s      r$   test_sublinear_tfr%    s    
qcA3A	tU	F	F	FBQ''))E8q====8eAh8eAh8a<<<<8a<<<<<<r&   c                  
   t          t          d d                   } t          d         g}t          t                    dz
  }t          d          }|                    |           }t          |d          r|                                }|d|j        d         f         dk    sJ t          |j        	          }||fD ]}|                    |          }t          |d          r|                                }|j        }|d|d
         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ d|vsJ d|vsJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ |d|d         f         dk    sJ t          d          }	|	
                    |                              |                                          }
t          |	j                  t          |j                  k    sJ |
j        |t          |j                  fk    sJ |	                    |                                          }|j        t          |          t          |j                  fk    sJ t          dd          }|
                    |                              |                                          }t          |d          rJ t          d          }t          j        t                     5  |                    |           d d d            n# 1 swxY w Y   t#          t%          j        |d          dg|z             t          t          d d                   } t)          d          }|j        |_        |                    |                                           }|j        rJ t#          |
|           |                    |                                          }t#          ||           t          d 	          }t          j        t                     5  |                    |            d d d            n# 1 swxY w Y   |                    dd           |                                }d}t3          |          } ||          }||k    sJ |                    dd            t          j        t                     5  |                                 d d d            n# 1 swxY w Y   d |_        t          j        t                     5  |                                 d d d            d S # 1 swxY w Y   d S )!Nrv         ?r   tocsrr   r   rw   r   saladtomatowaterthe	copyrightcokeburgerr   l1r  F)r  r$  idf_Tr$  r  r   rF   )rH   r   rI   _gabbledegook_)rH   rb   _invalid_analyzer_type_)r   r   r   r   r   hasattrr*  r   r   r   r   r  r4  r   r   r   r   r   npr  r   r   fixed_vocabulary_r   build_preprocessorr	   rn   )
train_data	test_datan_trainv1counts_trainv2r   counts_testr   t1r   
tfidf_testt2tft3tvtfidf2tfidf_test2v3	processorrp   r@   results                          r$   test_vectorizerrN     s~   mCRC())Jr"#I-  1$G 
	$	$	$B##J//L|W%% ,#))++2>'223q8888 
BN	3	3	3B "X 8 8kk),,;(( 	.%++--K]
1j112a77771j223q88881j112a7777 J&&&& *,,,, 1j001Q66661j223q88881j001Q66661j112a77777 
t	$	$	$BFF<  **<88@@BBErw<<3r~......;7C$7$788888 k**2244JIBN0C0CDDDDD 
tU	3	3	3B				'	'	5	5	=	=	?	?Br6""""" 
$	'	'	'B	z	"	" # #
\"""# # # # # # # # # # # # # # # bfRa0003%'/BBB mCRC())J	d	#	#	#B	BIj))1133F####eV,,, ,,y))1133Kj+666 
D	)	)	)B	z	"	" ! !
Z   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! MM5M999%%''IGD"4((HYt__Fv MM 0tMDDD	z	"	"    
                              .BM	z	"	"  
                 sH   L<<M M Q))Q-0Q-3TTT?U!!U%(U%c                     d\  } }}}t          | |||          }|                    t                     |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ d|_        d|_        d|_        d|_        |j        j        | k    sJ |j        j        |k    sJ |j        j        |k    sJ |j        j        |k    sJ |                    t                     |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ |j        j        |j        k    sJ d S )N)r  FFF)r  r$  r  r#  r2  T)r   r   r   _tfidfr  r$  r  r#  )r  r$  r  r#  rH  s        r$   test_tfidf_vectorizer_settersrQ  g  s   .G+D':|	7z
 
 
B FF>9>T!!!!9''''9:----9!\1111 BGBJBMBO9>T!!!!9''''9:----9!\1111FF>9>RW$$$$9
****92=00009!R_444444r&   c                     t                      } |                     t                    }|j        }|j        t          t                    | j        fk    sJ |j        | j        k    sJ t          j	        |j
                  dk    sJ t          j	        |j
                  dk     sJ t          j        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <t          dd          } |                     t                    }|j        t          t                    | j        fk    sJ |j        | j        k    sJ |j        }||k    sJ |d|z  k     sJ t          j	        |j
                  dk    sJ t          j        |j
                  dk     sJ t          |j        d                   D ];}t          t          j                            |d         j
        d          d           <d S )	Nr'  r   rv   rw   r   ru   r2  )rz   r  )r
   r   r   nnzr   r   
n_featuresdtyper9  mindatamaxranger   linalgr  )r   r   	token_nnzi
ngrams_nnzs        r$   test_hashing_vectorizerr^    s'   A	M""AI7s=))1<888887ag 6!&>>B6!&>>A6!&>>A6!&>>A 171: ? ?BINN1Q49a88#>>>> 	f4888A	M""A7s=))1<888887ag J	!!!!I%%%% 6!&>>B6!&>>A 171: ? ?BINN1Q49a88#>>>>? ?r&   c                  j   t          d          } t          j        t                    5  |                                  d d d            n# 1 swxY w Y   | j        rJ |                     t                    }|j        \  }}t          | j
                  |k    sJ |                                 }t          |t          j                  sJ |j        t          k    sJ t          |          |k    sJ t!          g d|           t#          |          D ]%\  }}|| j
                            |          k    sJ &g d}t          |          } |                                 }t!          g d|           | j        sJ t#          |          D ]%\  }}|| j
                            |          k    sJ &d S )Nr(  r)  	r   r1  celerir0  r   r+  	sparklingr,  r-  r   )r   r   r   r   r   r:  r   r   r   r   r   r   r9  ndarrayrU  r`   r   	enumerateget)r   r   	n_samplesrT  feature_namesidxnamer   s           r$   test_feature_namesrj    sG   		$	$	$B 
z	"	" # #
  """# # # # # # # # # # # # # # ##### 	''AGIzr~*,,,,,,..MmRZ00000&((((}++++
	
 
	
 
	
 	   }-- / /	Tbn((.......
 
 
E 
E	*	*	*B,,..M
	
 
	
 
	
 	   }-- / /	Tbn((......./ /s   AAAc                     h d}h d} | dd          }|                     t                     t          |j                  |k    sJ |j        |k    sJ d S )N>   r   r   r+  r1  >   r.  r0  r-  ra  r,  r/  rb  g333333?   )r   max_features)r   r   r   r   stop_words_)rD   expected_vocabularyexpected_stop_wordsr   s       r$   test_vectorizer_max_featuresrq    s    >>>   3Q777JNN=!!!z%&&*=====!%8888888r&   c                  N   t          d          } t          d          }t          d           }|                     t                                        d          }|                    t                                        d          }|                    t                                        d          }|                                 }|                                }|                                }d|                                k    sJ d|                                k    sJ d|                                k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d|t          j        |                   k    sJ d S )Nrv   rm  r   r   r     r.  )r   r   r   r  r   rX  r9  argmax)	cv_1cv_3cv_Nonecounts_1counts_3counts_None
features_1
features_3features_Nones	            r$   "test_count_vectorizer_max_featuresr    s    ***D***D4000G!!.115515==H!!.115515==H''77;;;CCK++--J++--J1133M !!!!!! Jry2233333Jry2233333M")K"8"89999999r&   c                  H   g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ t	          |j                  dk    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  d	k    sJ d|j        v sJ t	          |j                  d
k    sJ d|_        |                    |            d|j                                        vsJ t	          |j                                                  d	k    sJ d|j        v sJ t	          |j                  d
k    sJ d S )Nabcdeaeatr~   r   ry   r   r?   r   r   r(  rl  rw   rv   )r   r   r   r   r   rn  r   r=  r   s     r$   test_vectorizer_max_dfr     s   %%%IF3777DHHY$"''))))))t$$&&''1,,,,t  A%%%%DKHHYd&++------t$$&&''1,,,,$"""""t  A%%%%DKHHYd&++------t$$&&''1,,,,$"""""t  A%%%%%%r&   c                  H   g d} t          dd          }|                    |            d|j                                        v sJ t	          |j                                                  dk    sJ t	          |j                  dk    sJ d|_        |                    |            d	|j                                        vsJ t	          |j                                                  dk    sJ d	|j        v sJ t	          |j                  d
k    sJ d|_        |                    |            d	|j                                        vsJ t	          |j                                                  dk    sJ d	|j        v sJ t	          |j                  dk    sJ d S )Nr  r~   rv   )ry   min_dfr?   r   r   rw   r  rl  g?r   )r   r   r   r   r   rn  r  r  s     r$   test_vectorizer_min_dfr  7  s   %%%IF1555DHHY$"''))))))t$$&&''1,,,,t  A%%%%DKHHYd&++------t$$&&''1,,,,$"""""t  A%%%%DKHHYd&++------t$$&&''1,,,,$"""""t  A%%%%%%r&   c                  "   ddg} t          dd          }|                    |                                           }t          g d|                                           t          g dg dg|           t          ddd	
          }|                    |                                           }t          g dg dg|           t          ddd	t
          j                  }|                    |           }|j        t
          j        k    sJ d S )Naaabcabbder~   r   r  )r?   r  r  dr)   )r   rv   rv   r   r   )rv   rw   r   rv   rv   T)ry   r   binary)rv   rv   rv   r   r   )rv   rv   r   rv   rv   )ry   r   r  rU  )r   r   r  r   r   r9  float32rU  )r=  r   r   X_sparses       r$   test_count_binary_occurrencesr  N  s   '"IF3777D9%%--//A000$2L2L2N2NOOO91=== F3tDDDD9%%--//A91=== F3t2:VVVD!!),,H>RZ''''''r&   c                  v   ddg} t          ddd           }|                    |           }t          j        |dd         j                  dk    sJ t          j        |dd	         j                  d	k    sJ |j        t          j        k    sJ t          ddd
d           }|                    |           }t          j        |j                  dk    sJ |j        t          j        k    sJ t          ddd
d t          j                  }|                    |           }|j        t          j        k    sJ d S )Nr  r  Fr~   )alternate_signry   r  r   rv   r   rw   T)ry   r  r  r  )ry   r  r  r  rU  )r
   r   r9  rX  rW  rU  float64)r=  r   r   s      r$   test_hashed_binary_occurrencesr  b  sE    '"IEFNNNDy!!A6!AaC&+!####6!AaC&+!####7bj     d  D 	y!!A6!&>>Q7bj     dRZ  D 	y!!A7bj      r&   c                    t           } |             }|                    |          }|                    |          }t          |t                    sJ |                                }t          ||          D ]j\  }}t          j        t          j	         ||                              }t          j        t          j	        |                    }t          ||           kt          j        |          sJ |j        dk    sJ |                                }	|                    |	          }
t          ||
          D ]9\  }}t          t          j        |          t          j        |                     :|                                }|                    |          }t          ||          D ]9\  }}t          t          j        |          t          j        |                     :d S )Ncsr)r   r   r   r   r   rn   zipr9  sortuniquer   r   issparseformatr  tocsc)rD   rW  r   transformed_datainversed_dataanalyzedocinversed_termsr   transformed_data2inversed_data2terms2transformed_data3inversed_data3terms3s                  r$   !test_vectorizer_inverse_transformr  }  s    DJ!//55001ABBMmT*****''))G"477 2 2^	''#,,//00>!:!:;;5.1111?+,,,,,"e++++ )0022112CDDN]N;; < <v275>>276??;;;; )..00112CDDN]N;; < <v275>>276??;;;;< <r&   c                     t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
d}t          ||dd          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ d S )Nr'  rv   g?r   	test_sizerandom_stater   svcrv   rv   ru   hingesquared_hinge)vect__ngram_range	svc__lossr   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_best_estimator_r   rz   rW  targetr<  r=  target_traintarget_testpipeline
parametersgrid_searchpredbest_vectorizers              r$   -test_count_vectorizer_pipeline_grid_selectionr    s-   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-/ J xA!DDDK ??:|44<<YGGDt[)))
 "c))))!1=fEO&&000000r&   c                  :   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          | |dd          \  }}}}t	          dt                      fdt                      fg          }dd	gd
dd}t          ||d          }|                    ||          	                    |          }	t          |	|           |j        dk    sJ |j        j        d         }
|
j        dk    sJ |
j        dk    sJ |
j        rJ d S )Nr'  rv   g?r   r  r   r  r  ru   )r2  r  r  )r  
vect__normr  )r  r   r  )r   r  r   r   r   r   r   r   r   r  r   r  r  r   rz   r  r:  r  s              r$   'test_vectorizer_pipeline_grid_selectionr    sQ   --D TC'''1#4E0F0F*FFF 8Hf!8 8 84J	< &/"3"34uikk6JKLLH %f-"/ J xA>>>K ??:|44<<YGGDt[)))
 "c))))!1=fEO&&00004''''000000r&   c                  *   t           t          z   } dgt          t                     z  dgt          t                    z  z   }t          dt	                      fdt                      fg          }t          || |d          }t          |g d           d S )Nr'  rv   r   r  r   )r   r  )r   r  r   r   r   r   r   r   )rW  r  r  	cv_scoress       r$   )test_vectorizer_pipeline_cross_validationr    s    --D TC'''1#4E0F0F*FFF&/"3"34uikk6JKLLH$1===Iy///22222r&   c                  t   d} t                      }|                    | g          }|j        dk    sJ t          d d          }|                    | g          }|j        dk    sJ |j        |j        k    sJ t          t          j        |j	                  t          j        |j	                             d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)rv      F)r  r  )rv   i   )
r   r   r   r
   r   rS  r   r9  r  rW  )r   r   	X_countedX_hasheds       r$   test_vectorizer_unicoder    s    	1  D""H:..I?g%%%%$u===D~~xj))H>Z'''' =HL(((( rwy~..0F0FGGGGGr&   c                     ddg} t          |           }|                    t                    }|                    t                    }t	          |                                |                                           |j        sJ d S )Nr   ra  r   )r   r   r   r   r   r  r:  )r   r   X_1X_2s       r$   +test_tfidf_vectorizer_with_fixed_vocabularyr    su    8$Jj111D


]
+
+C
..
'
'CckkmmS[[]];;;!!!!!!r&   c                     t                      t          d          t          d          t          d          t                      t          t                    t          t                    t          t                                        t
                    t          t          	                              t
                    t                      t          t                    t                                          t
                    g} | D ]}t          j	        |          }t          j
        |          }t          |          |j        k    sJ |                                |                                k    sJ t          rt          |t                     rt!          |                    t
                    |                    t
                               d S )
Nr2  r3  T)r  ru   rz   ra   )ry   rG   )r
   r   r   r3   r   r   r+   r   pickledumpsloadstype	__class__
get_paramsr   r   r   r   )	instancesorigr#   copys       r$   test_pickling_vectorizerr    s   t$$$&&&f---Z000...Z00044^DDl33377GG...n--I   L|ADzzT^++++  DOO$5$55555 	z$(9:: 	("">22"">22    r&   factoryc                     t                      } | |          }d}t          j        t          j        |                    } ||          } ||          }||k    sJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rI   N)r   r  r  r  )r  vecfunctionrp   roundtripped_functionr@   rM  s          r$   test_pickling_built_processorsr  6  so     

Cws||HGD"Lh)?)?@@x~~H""4((FXr&   c                     t           j                            d          } t          j        g d          }t	          dd          D ]}t          |                     |dd                    }t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S Nr   r`  d   r   F)sizer*   r   )r9  randomRandomStatearrayrY  r   choicer   r  r  r  r   r   r   r   )rngvocab_wordsx	vocab_setr   unpickled_cvs         r$   -test_countvectorizer_vocab_sets_when_picklingr  K  s     )


"
"C(
	
 
	
 
	
 K 1c]] 
 


;Q
FFGG		222|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r&   c                  Z   t           j                            d          } t          j        g d          }t	          dd          D ]}t                      }|                     |dd          }t	          dd          D ]}||||         <   t          |          }t          j	        t          j
        |                    }|                    t                     |                    t                     t          |                                |                                           d S r  )r9  r  r  r  rY  r   r  r   r  r  r  r   r   r   r   )r  r  r  
vocab_dictr   yr   r  s           r$   .test_countvectorizer_vocab_dicts_when_picklingr  g  s%   
)


"
"C(
	
 
	
 
	
 K 1c]] 
 
VV


;Q
>>q! 	% 	%A#$JuQx  
333|FL$4$455
}'''$$&&(J(J(L(L	
 	
 	
 	

 
r&   c                     t                                          t                    t          t                                        t                    t          t
                                        t                    f} | D ]}|                    t                                                    }d |_        |                    t                                                    }t          |d           |                    t                                                    }t          ||           t          ||           d S )Nra   rG   rn  )r   r   r   r   r   r+   r   r  rn  delattrr   )fitted_vectorizersr   vect_transformstop_None_transformstop_del_transforms        r$   test_stop_words_removalr    s    	n--Z00044^DDl33377GG # 
? 
?77??AA"nn^<<DDFFm$$$!^^N;;CCEE.???-~>>>>
? 
?r&   c                     t                                          t                    } t                                          |           }t          j        |          }t          j        |          }t          |          |j	        k    sJ t          |                    |                                           |                    |                                                      d S r    )r   r   r   r   r   r  r  r  r  r  r   r  )r   r  r#   r  s       r$   test_pickling_transformerr     s    ''77A!!!$$DTA<??D::''''t))!,,44668J8J18M8M8U8U8W8WXXXXXr&   c                  |   t                                          t                    } t                                          |           }t                      }|j        |_        t          |                    |                                           |                    |                                                      d S r    )	r   r   r   r   r   r4  r   r   r  )r   r  r  s      r$   test_transformer_idf_setterr    s    ''77A!!!$$DD	DIt~~a((0022DNN14E4E4M4M4O4OPPPPPr&   c                     t          d          } |                     t                     t          | j        d          }| j        |_        t          |                    t                                                    |                     t                                                               t          | j        d          }d}t          j	        t          |          5  | j        |_        d d d            d S # 1 swxY w Y   d S )NTr5  r   r$  Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r4  r   r   r  r   r   r   )r  r  r   s      r$   test_tfidf_vectorizer_setterr    s(   4(((DHH^d&6EEED	DI~&&..00~&&..00  
 d&6FFFD;G	z	1	1	1  I	                 s   $C>>DDc                  F   t          d          } |                     t                     t          | j        d          }t	          | j                  }dg|dz   z  }t          j        t                    5  t          |d|           d d d            d S # 1 swxY w Y   d S )NTr5  r  r   rv   r4  )
r   r   r   r   r   r4  r   r   r   setattr)r   r  expected_idf_leninvalid_idfs       r$   %test_tfidfvectorizer_invalid_idf_attrr
    s    4(((DHH^d&6EEED49~~%+a/0K	z	"	" + +fk***+ + + + + + + + + + + + + + + + + +s   7BBBc                      g d} t          |           }t          j        t                    5  |                    g            d d d            d S # 1 swxY w Y   d S )N)r?   r  r  r?   r?   r   r   r   s     r$   test_non_unique_vocabr    s    %%%Ee,,,D	z	"	"                   s   AAAc                      d} t           }d }t          j        ||           5   |             d d d            d S # 1 swxY w Y   d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  f    t                      } |                     dt          j        dg           d S )Nhello worldhello hello)r
   r   r9  nan)hvs    r$   funcz0test_hashingvectorizer_nan_in_docs.<locals>.func  s0      
-?@@@@@r&   r   )r   r   r   )r  	exceptionr  s      r$   "test_hashingvectorizer_nan_in_docsr    s     PGIA A A 
y	0	0	0                   s   ;??c                  p   t          ddd           } | j        sJ |                     ddg                                          }t	          |                                g d           |                     ddg                                          }t	          |                                g d           d S )NTF)r  r$  r  r  r  )rv   rv   rv   r   )r   r  r   r  r   ravelr   )r   r   r   s      r$   test_tfidfvectorizer_binaryr    s    tU>>>A8OOO	677??AAAqwwyy,,,///	
m]3	4	4	<	<	>	>Brxxzz<<<00000r&   c                      t          d          } |                     t                     t          | j        | j        j                   d S )NTr5  )r   r   r   r   r4  rP  )r   s    r$   test_tfidfvectorizer_export_idfr    sA    4(((DHH^di)9:::::r&   c                      t          dg          } t          |           }|                     t                     |                    t                     |j        | j        k    sJ d S )Nr.  r   )r   r   r   r   r   )
vect_vocabvect_vocab_clones     r$   test_vectorizer_vocab_cloner    se     UG444JZ((NN=!!!'''':+AAAAAAAr&   c                    d} |             }t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   t          j        t          |          5  |                    d           d d d            n# 1 swxY w Y   |                    ddg           t          j        t          |          5  |                    d           d d d            d S # 1 swxY w Y   d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)r   r   r   r   r   r   )rD   r  r  s      r$   &test_vectorizer_string_object_as_inputr!    s    SG
*,,C	z	1	1	1 * *.)))* * * * * * * * * * * * * * * 
z	1	1	1                                 GG[+,---	z	1	1	1 & &n%%%& & & & & & & & & & & & & & & & & &s5   A

AA0BBBC22C69C6X_dtypec                     t          j        dd| d          }t                                          |          }|j        |j        k    sJ d S N
   i N  *   )rU  r  )r   randr   r   rU  )r"  r   X_transs      r$   test_tfidf_transformer_typer)    sN    BW2>>>A  ..q11G=AG######r&   c                  b   t          j        ddt          j        d          } t          j        |           }t          j        |           }t                                          |          }t                                          |          }t          ||           |j	        |j	        k    sJ d S r$  )
r   r'  r9  r  
csc_matrix
csr_matrixr   r   r   r  )r   X_cscX_csrX_trans_cscX_trans_csrs        r$   test_tfidf_transformer_sparser1    s    BRZbAAAAa  Ea  E"$$22599K"$$22599K k:::!3333333r&   z0vectorizer_dtype, output_dtype, warning_expectedTFc                    t          j        g d          }t          |           }d}|rIt          j        t
          |          5  |                    |          }d d d            n# 1 swxY w Y   nZt          j                    5  t          j	        dt
                     |                    |          }d d d            n# 1 swxY w Y   |j
        |k    sJ d S )N)numpyscipysklearnrU  z'dtype' should be used.r   r   )r9  r  r   r   r  r  r   r  r  r  rU  )vectorizer_dtypeoutput_dtypewarning_expectedr   r   warning_msg_matchX_idfs          r$   test_tfidf_vectorizer_typer<    s_    	...//A '7888J1 0\+->??? 	0 	0,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 $&& 	0 	0!';777,,Q//E	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 ;,&&&&&&s$   A((A,/A,0CC
Cr  )rw   rv   r  c                    | j         }t          j        d| d          }t          | t                    rt
          rt          j        d           t          j        t          |          5  | 
                    dg           d d d            n# 1 swxY w Y   t          j        t          |          5  |                     dg           d d d            n# 1 swxY w Y   t          | t                    rLt          j        t          |          5  |                     dg           d d d            d S # 1 swxY w Y   d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.*HashingVectorizer is not supported on PyPy)reasonr   zgood news everyone)rz   reescaper   r
   r   r   xfailr   r   r   r   r   )r  invalid_ranger  s      r$   $test_vectorizers_invalid_ngram_rangerD  0  s    OMi	9 	9 	9 	9 G #()) Jg JHIIII	z	1	1	1 ( (%&'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( 
z	1	1	1 2 2/01112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 #()) 2]:W555 	2 	2MM/0111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	22 2s6   ,BBB5CCCD77D;>D;c                     |                                  }|                                 }|                                 }|                     |||          S r    )r   build_tokenizerr;  _check_stop_words_consistency)	estimatorr   tokenize
preprocesss       r$   rG  rG  N  sM    ))++J((**H--//J22:z8TTTr&   c                     d} d| z  }t                      t                      t                      fD ]x}|                    g d           t	          j        t          |          5  |                    dg           d d d            n# 1 swxY w Y   |`t          |          du sJ yt          j                    5  t          j        dt                     |                    dg           d d d            n# 1 swxY w Y   t          |          J |                    g d	           t	          j        t          |          5  |                    dg           d d d            d S # 1 swxY w Y   d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   r  Fr   )rL  rM  rN  blahrO  )r   r   r
   r   r   r  r  r   _stop_words_idrG  r  r  r  )lstrr  r  s      r$   'test_vectorizer_stop_words_inconsistentrS  U  sT   #D	')-	. 
  !!?#4#46G6I6IJ ; ;"D"D"DEEE\+W555 	/ 	/}o...	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ ,S11U::::: 
	 	"	" + +g{333=/***+ + + + + + + + + + + + + + + )--555 NNHHHNIII	k	1	1	1 + +=/***+ + + + + + + + + + + + + + + + + +s6   &B		B	B	=1C::C>C>	E--E14E1c                  N   t          j        dt          j                  } t          j        }| j                            |          | _        | j                            |          | _        dddd}t                                          | |          }||j        j	        k    sJ dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )r   r   r6  r   rv   rw   )zscikit-learnrU   zgreat!N)
r   r,  r9  int64indicesastypeindptrr   _sort_featuresrU  )r   INDICES_DTYPEr   Xss       r$   7test_countvectorizer_sort_features_64bit_sparse_indicesr\  q  s     	&111A HM	  //AIx}--AH"#1::J				)	)!Z	8	8BBJ,,,,,,,r&   	Estimatorc                    ddig} |             }t          |          du sJ  | d dg          }t          |          dk    sJ t          |          J |                    |            G d d	|           } |dg
          }t          |          dk    sJ  | d dg          }t          |          du sJ d S )Nrp   r   Tc                     | d         S Nrp   r2   r  s    r$   <lambda>z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>  s
    1V9 r&   and)rb   r   r   c                       e Zd Zd ZdS )Ftest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                     d S )Nc                     | d         S r`  r2   ra  s    r$   rb  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>  s
    QvY r&   r2   )selfs    r$   r;  zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor  s    &&&r&   N)__name__
__module____qualname__r;  r2   r&   r$   CustomEstimatorre    s#        	' 	' 	' 	' 	'r&   rl  r   c                 P    t          j        d                              |           S )Nz\w{1,})r@  compilefindallr  s    r$   rb  z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>  s    bj33;;C@@ r&   )rm   r   )rG  r   )r]  rW  r  rl  s       r$   -test_stop_word_validation_custom_preprocessorrq    s*   
 [!"D
)++C(--5555
)!4!4%
I
I
IC(--8888(--555d' ' ' ' ') ' ' ' /eW
-
-
-C(--8888
)@@eW  C )--555555r&   zinput_type, err_type, err_msgfilenamer=   r[   z$'str' object has no attribute 'read'c                    t          | t                    rt          rt          j        d           dg}t          j        ||          5   | d |                              |           d d d            d S # 1 swxY w Y   d S )Nr>  "this is text, not file or filenamer   c                 *    |                                  S r    r-   ra  s    r$   rb  z.test_callable_analyzer_error.<locals>.<lambda>  s    QWWYY r&   ry   r\   )
issubclassr
   r   r   rB  r   r   )r]  
input_typeerr_typer   rW  s        r$   test_callable_analyzer_errorrz    s     ).// CG CABBB01D	xw	/	/	/ V V	..jAAAOOPTUUUV V V V V V V V V V V V V V V V V Vs   
"A99A= A=)marksry   c                 "    t          | d          S )Nr)openrp  s    r$   rb  rb    s    T#s^^ r&   c                 *    |                                  S r    )readrp  s    r$   rb  rb    s     r&   rx  c                     dg}t          j        t          t          f          5   | ||                              |           d d d            d S # 1 swxY w Y   d S )Nrt  rv  )r   r   FileNotFoundErrorAttributeErrorr   )r]  ry   rx  rW  s       r$   &test_callable_analyzer_change_behaviorr    s     11D	)>:	;	; K K	8:666DDTJJJK K K K K K K K K K K K K K K K K Ks   !AAAc                 j   d }t          |t                    rt          rt          j        d           |                     d          }|                    d           t          j        t          d          5   ||d          	                    |g           d d d            d S # 1 swxY w Y   d S )	Nc                      t          d          )Ntesting)	Exceptionrp  s    r$   ry   z6test_callable_analyzer_reraise_error.<locals>.analyzer  s    	"""r&   r>  zfile.txtzsample content
r  r   r[   rv  )
rw  r
   r   r   rB  joinwriter   r  r   )tmpdirr]  ry   fs       r$   $test_callable_analyzer_reraise_errorr    s   
# # # ).// CG CABBBJAGG	y		2	2	2 F F	86222@@!EEEF F F F F F F F F F F F F F F F F Fs   9"B((B,/B,zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgrL  rN  r  r~   z'stop_words'
'analyzer'	!= 'word'c                 *    |                                  S r    r-   r"   s    r$   rb  rb        aggii r&   z'tokenizer'c                 *    |                                  S r    r-   r"   s    r$   rb  rb    r  r&   \w+rs   'token_pattern'zis not Nonec                 *    |                                  S r    r!   r"   s    r$   rb  rb    r  r&   c                 *    |                                  S r    r  r"   s    r$   rb  rb    r  r&   z'preprocessor'zis callableru   c                 *    |                                  S r    r  r"   s    r$   rb  rb    r  r&   z'ngram_range')	NNNr  r  r~   r  r  r  c
                    t           }
 |             }|                    ||||||           d|d|d|	}t          j        t          |          5  |                    |
           d d d            d S # 1 swxY w Y   d S )N)r   rm   rb   rz   r   ry   zThe parameter z will not be used since  r   )r   r   r   r  r  r   )rD   r   rm   rb   rz   r   ry   unused_name	ovrd_nameovrd_msgr<  r   r   s                r$   test_unused_parameters_warnr    s    p  J:<<DOO!#      			C
 
k	-	-	-                   s   A66A:=A:zVectorizer, Xrv   rw   )r   barr   )r   bazc                      |             }t          |d          rJ |                    |           t          |d          rJ d S )Nn_features_in_)r8  r   )rD   r   r   s      r$   test_n_features_inr  I  sU     Jz#344444NN1z#34444444r&   c                      t          d          } |                     ddg          j        }|                     ddg          j        }||k    sJ d S )Nrv   rs  helloworld)r   r   r   )r  vocab1vocab2s      r$   )test_tie_breaking_sample_order_invariancer  X  s]     q
)
)
)CWWgw'((4FWWgw'((4FVr&   c                  z    t          dd          } |                     dg          j        }|d         dk    sJ d S )Ni@B )rw   r   )rT  rz   z22pcs efuturer   )r
   r   rV  )hashingrV  s     r$   2test_nonnegative_hashing_vectorizer_result_indicesr  a  sD      7GGGG 122:G1:??????r&   c                 >     |             }t          |d          rJ dS )z0Check that vectorizers do not define set_output.
set_outputN)r8  )r]  r   s     r$   'test_vectorizers_do_not_have_set_outputr  i  s+    
 )++CsL)))))))r&   )collections.abcr   r@  r   r  r4  r   sklearn.feature_extraction.textr   r   r	   r
   r   r   r   r   sklearn.model_selectionr   r   r   sklearn.pipeliner   sklearn.svmr   sklearn.baser   r3  r9  numpy.testingr   r   sklearn.utilsr   sklearn.utils._testingr   r   r   r   collectionsr   	functoolsr   r  ior   r   r  r   r%   r+   r/   r3   rA   rC   markparametrizerq   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r!  r%  rN  rQ  r^  rj  rq  r  r  r  r  r  r  r  r  r  r  r  r  rn   r;  rF  r  r  r  r  r   r  r  r
  r  r  r  r  r  r!  r  r  r)  r1  int32rU  r<  rD  rG  rS  r\  rq  r  r  rz  paramr  r  r  r  r  r  r  r2   r&   r$   <module>r     s   # # # # # # 				         6 6 6 6 6 6 A A A A A A ? ? ? ? ? ? = = = = = = ; ; ; ; ; ; < < < < < < ; ; ; ; ; ; > > > > > > 4 4 4 4 4 4 3 3 3 3 3 3 0 0 0 0 0 0 % % % % % % ! ! ! ! ! !           3 3 3 3 3 3 , , , , , , ! ! ! ! ! !            $ # # # # #               !22, , ,       $ $ $!0 !0 !0H. . .* 9J'KLL:  :  ML: z     <  && & &4& & &() ) ).& & &*
+ 
+ 
+& & &$ $ $0 0 0E E E& & &4 4 4&  &) ) )*< < <   & & &&  d d dN5 5 5: #? #? #?LD/ D/ D/N 'IJJ9 9 KJ9&: : :4& & &.& & &.( ( (( ! ! !4 'IJJ< < KJ<>!1 !1 !1H$1 $1 $1N
3 
3 
3 H H H0" " "  < &*' 
 
 

 
 
8
 
 
:? ? ?,Y Y YQ Q Q   + + +     1 1 1; ; ;B B B ?O5FG & & & RZ$<==$ $ >=$4 4 4 6	2:t$	2:t$	RZ'	RZ'	 ' ' ' 	f---F+++F+++ 2 2 2,U U U + + +6 - - -0 /?4EF 6 6  62 /?4EF  #	&+	!GH V V  V &m<<<  ++-C-CD  
';<<K K =<  K /?4EF F F F ?$5G  1 x 
	
 
	
 
	
 
	
 
	

	
qCG GP QG G VB 	Qq111Q3G3GHI	.) 5 5 5      /?4DFWX * * * * *r&   