
    c                        d Z ddlmZmZ ddlZddlZddlZddlmZ ddl	Z
ddlmZ ddlmZ ddlmZmZ ddlmZmZmZmZ  G d	 d
          Z e e                      Zd  ee          D             Zd Zd Z G d dej                  Z  e!e d          sddZ" e#e de"            G d d          Z$ G d d          Z% edd          Z&d dZ'e(dk    r* ej)        dej*                    ej+        d           dS dS )!zN
Automated tests for checking transformation algorithms (the models package).
    )with_statementdivisionN)
namedtuple)log_capture)utils)doc2veckeyedvectors)datapathget_tmpfiletemporary_filecommon_textsc                   "    e Zd ZddZd Zd ZdS )DocsLeeCorpusFc                 "    || _         || _        d S N)string_tagsunicode_tags)selfr   r   s      8lib/python3.11/site-packages/gensim/test/test_doc2vec.py__init__zDocsLeeCorpus.__init__   s    &(    c                 6    | j         rd|z  S | j        rd|z  S |S )Nu   _¡_%dz_*%d)r   r   )r   is     r   _tagzDocsLeeCorpus._tag   s1     	?" 	A:r   c              #     K   t          t          d                    5 }t          |          D ]B\  }}t          j        t          j        |          |                     |          g          V  C	 d d d            d S # 1 swxY w Y   d S )Nzlee_background.cor)openr
   	enumerater   TaggedDocumentr   simple_preprocessr   )r   fr   lines       r   __iter__zDocsLeeCorpus.__iter__&   s      (/0011 	\Q$Q<< \ \4,U-DT-J-JTYYWX\\N[[[[[[\	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\ 	\s   AB  BBN)FF)__name__
__module____qualname__r   r   r"    r   r   r   r      sK        ) ) ) )  \ \ \ \ \r   r   c                 @    g | ]\  }}t          j        ||g          S r&   r   r   .0r   wordss      r   
<listcomp>r,   /   s+    YYYHAuW#EA3//YYYr   c                      t          d          } t          j        t                      d          }|                    |            t          j                    }|                    |           S )Ngensim_doc2vec.tst   	min_count)r   r   Doc2Vecr   saveload)tmpfmodels     r   load_on_instancer7   2   sY    +,,DOMOOq999E	JJtOE::dr   c                 \    t          j        d t                      D             |            d S )Nc              3   $   K   | ]}|j         V  d S r   )r+   )r*   docs     r   	<genexpr>z3save_lee_corpus_as_line_sentence.<locals>.<genexpr><   s$       F Fs F F F F F Fr   )r   save_as_line_sentencer   corpus_files    r    save_lee_corpus_as_line_sentencer?   ;   s.    	 F Fmoo F F FTTTTTr   c                       e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Z ej        ej        dk    d          d             Z ej        ej        dk    d          d             Zd Zd Zd Zd Zd Zd Zd Zd Zd>dZd Zd Zd Z d Z!d  Z"d! Z#d" Z$d# Z%d$ Z&d% Z'd& Z(d' Z)d( Z*d) Z+d* Z,d+ Z-d, Z.d- Z/d. Z0d/ Z1d0 Z2d1 Z3d2 Z4d3 Z5d4 Z6d5 Z7d6 Z8d7 Z9d8 Z: e;            d9             Z< e;            d:             Z=d; Z>d< Z?d=S )?TestDoc2VecModelc                     t          d          }t          j        t                      d          }|                    |           |                     |t          j                            |                     dS )&Test storing/loading the entire model.r.   r/   r0   N)r   r   r2   r   r3   models_equalr4   )r   r5   r6   s      r   test_persistencez!TestDoc2VecModel.test_persistence@   sf    /001===

4%!5!5d!;!;<<<<<r   c                 h   t          t          d                    5 }t          |           t          d          }t          j        |d          }|                    |           |                     |t          j                            |                     ddd           dS # 1 swxY w Y   dS )rC   r.   r/   )r>   r1   N)r   r   r?   r   r2   r3   rD   r4   )r   r>   r5   r6   s       r   test_persistence_fromfilez*TestDoc2VecModel.test_persistence_fromfileG   s    K(<==>> 	A+,[999344DOqIIIEJJteW_%9%9$%?%?@@@	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	As   A=B''B+.B+c                 h   t          j        t                      d          }t          d          }|                    |ddd           t
          j                            |d          }|                     t          |j
                  t          |j                  z   t          |                     t          d          }|                    |ddd           t
          j                            |d          }|                     t          |j                  t          |                     t          d	          }|                    |ddd           t
          j                            |d          }|                     t          |j
                  t          |                     d
S )z1Test storing the entire model in word2vec format.r/   r0   zgensim_doc2vec.dwTF
doctag_vecword_vecbinary)rL   zgensim_doc2vec.dzgensim_doc2vec.wN)r   r2   r   r   save_word2vec_formatr	   KeyedVectorsload_word2vec_formatassertEquallenwvdv)r   r6   test_doc_wordbinary_model_dvtest_doc	test_words         r    test_persistence_word2vec_formatz1TestDoc2VecModel.test_persistence_word2vec_formatQ   sw   1===#$788""=TDY^"___&3HH_dHeeUXUX6O8L8LMMM122""8uUY"ZZZ&3HHZ^H__UXO(<(<=== 233	""9VZ"[[[&3HH[_H``UXO(<(<=====r   c                    d}t           j                            t          |                    }|                     |j        j        j        dk               |                     t          |j                  dk               |                     t          |j        j	                  dk               | 
                    |j                   |                     |j        j        t          |j                  |j        fk               |                     |j        j        j        dk               |                     |j        j        dk               |                     |j        j        j        dk               |                     |j        j        j        dk               |                     t          |j                  dk               |                     |           dS )	<Test loading an old doc2vec model from indeterminate versiondoc2vec_olds  d   r]   r]   ,  r^   ra   ra   Nr   r2   r4   r
   
assertTruerR   vectorsshaperQ   index_to_keyassertIsNonecorpus_total_wordssyn1negvector_sizevectors_lockf	cum_tablerS   model_sanityr   
model_filer6   s      r   obsolete_testLoadOldModelz*TestDoc2VecModel.obsolete_testLoadOldModeld   s}    #
$$Xj%9%9::(.+=>>>EH-...EH122d:;;;%2333+EHu?P/QQRRR.4@AAA-9:::(.*<===.4?@@@EH,---%     r   c                    d}t           j                            t          |                    }|                     |j        j        j        dk               |                     t          |j                  dk               |                     t          |j        j	                  dk               | 
                    |j                   |                     |j        j        t          |j                  |j        fk               |                     |j        j        j        dk               |                     |j        j        dk               |                     |j        j        j        dk               |                     |j        j        j        dk               |                     t          |j                  dk               |                     |           dS )	rZ   doc2vec_old_sepr\   r]   r_   r`   rb   ra   Nrc   ro   s      r   "obsolete_testLoadOldModelSeparatesz3TestDoc2VecModel.obsolete_testLoadOldModelSeparatesw   s}    '
$$Xj%9%9::(.+=>>>EH-...EH122d:;;;%2333+EHu?P/QQRRR.4@AAA-9:::(.*<===.4?@@@EH,---%     r   c                     d}t           j                            t          |                    }|                     |           g d}|D ]}|                     |           dS )zTest loading pre-1.0 modelszd2v-lee-v0.13.0)
z0.12.0z0.12.1z0.12.2z0.12.3z0.12.4z0.13.0z0.13.1z0.13.2z0.13.3z0.13.4N)r   r2   r4   r
   rn   _check_old_version)r   rp   r6   old_versionsold_versions        r   %obsolete_test_load_old_models_pre_1_0z6TestDoc2VecModel.obsolete_test_load_old_models_pre_1_0   s{    &
$$Xj%9%9::%   
 
 
 ( 	1 	1K##K0000	1 	1r   c                 B    ddg}|D ]}|                      |           dS )zTest loading 1.x modelsz1.0.0z1.0.1Nrv   r   rw   rx   s      r   !obsolete_test_load_old_models_1_xz2TestDoc2VecModel.obsolete_test_load_old_models_1_x   B     W
 ( 	1 	1K##K0000	1 	1r   c                 B    g d}|D ]}|                      |           dS )zTest loading 2.x models)z2.0.0z2.1.0z2.2.0z2.3.0Nr{   r|   s      r   !obsolete_test_load_old_models_2_xz2TestDoc2VecModel.obsolete_test_load_old_models_2_x   E    
 
 
 ( 	1 	1K##K0000	1 	1r   c                 B    g d}|D ]}|                      |           dS )Test loading 3.x models)z3.2.0z3.1.0z3.0.0Nr{   r|   s      r   %obsolete_test_load_old_models_pre_3_3z6TestDoc2VecModel.obsolete_test_load_old_models_pre_3_3   r   r   c                 B    ddg}|D ]}|                      |           dS )r   z3.4.0z3.3.0Nr{   r|   s      r   &obsolete_test_load_old_models_post_3_2z7TestDoc2VecModel.obsolete_test_load_old_models_post_3_2   r~   r   c                    t          j        d|           t          d          }t          j                            |                    |                    }|                     t          |j	                  dk               | 
                    |j                   |                     |j	        j        j        dk               |                     |j        j        j        dk               |                     t          |j                  dk               |                    t!          t#                                d         j                  }|j                            |gt          |j                            }|                     |           t)          d	          }|                    |           t          j                            |          }|                    t!          t#                                d         j                  }|j                            |gt          |j                            }|                     |           d S )
Nz TESTING LOAD of %s Doc2Vec MODELzold_d2v_models/d2v_{}.mdl   )r      )   r   r   r   topnr.   )logginginfor
   r   r2   r4   formatrd   rQ   rR   rh   ri   re   rf   rS   infer_vectorlistr   r+   most_similarr   r3   )r   rx   saved_models_dirr6   doc0_inferredsims_to_inferr5   loaded_models           r   rv   z#TestDoc2VecModel._check_old_version   s   7EEE#$?@@$$%5%<%<[%I%IJJEH*+++%2333(.&8999(.&8999EH*+++**4+@+@+C+IJJ--}oCMM-RR&&&/00

4++D11$11$}2G2G2J2PQQ$44m_3|K_K_4``&&&&&r   c                    t          j        d          }|                    t                     |                     t
          |j        d           |                     t
          |j        d           |                     t
          |j        t          d           |                     t
          |j        d d            |                     t
          |j        t                     d S )N2   )rk   )corpus_iterableig+  r=   test)r   r>   )r   r2   build_vocablist_corpusassertRaises	TypeErrortrain	sentencesr   r6   s     r   test_doc2vec_train_parametersz.TestDoc2VecModel.test_doc2vec_train_parameters   s    B///+666)U[eDDD)U[%HHH)U[)Y_```)U[$TXYYY)U[iHHHHHr   ntz"See another test for Windows belowc                    g d}t          d          }t          j        |dd          5 }|D ])}|                    t          j        |                     *	 d d d            n# 1 swxY w Y   t
          j                            |d          \  }}|                     |dg           |                     |dg           t
          j                            |d          \  }}|                     |dd	g           |                     |ddg           t
          j                            |d
          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           d S )Nzline1
zline2
zline3
zline4
zline5
r.   wbutf8encodingr/   r   r      r   )r         r   r/   r   r   )r   r   r   r   r   r/   r   r      )r   r   r   r      r   r/   r   r   r   r   )r   r   r   r   r   r   )r   r   r/   r   r   r   	r   r   r   writeany2unicoder   r2   -_get_offsets_and_start_doctags_for_corpusfilerP   r   linesr5   foutr!   offsetsstart_doctagss          r   "test_get_offsets_and_start_doctagsz3TestDoc2VecModel.test_get_offsets_and_start_doctags       HGG/00ZdV444 	4 4 4

5,T2233334	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 ")!^!^_cef!g!g1#&&&,,,!(!^!^_cef!g!g1b'***A///!(!^!^_cef!g!g***---			222!(!^!^_cef!g!g...111555!(!^!^_cef!g!g"4"4"4555888!(!^!^_cef!g!g"7"7"7888(:(:(:;;;;;   -A%%A),A)z See another test for posix abovec                    g d}t          d          }t          j        |dd          5 }|D ])}|                    t          j        |                     *	 d d d            n# 1 swxY w Y   t
          j                            |d          \  }}|                     |dg           |                     |dg           t
          j                            |d          \  }}|                     |dd	g           |                     |ddg           t
          j                            |d
          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           t
          j                            |d          \  }}|                     |g d           |                     |g d           d S )Nr   r.   r   r   r   r/   r   r      r   )r         r   r   )r   r   r   r   r   r   )r   r   r   r      r   r   )r   r   r   r   r   r   )r   r   r/   r   r   r   r   r   s          r   &test_get_offsets_and_start_doctags_winz7TestDoc2VecModel.test_get_offsets_and_start_doctags_win   r   r   c                 h   g d}t          d          }t          j        |dd          5 }|D ])}|                    t          j        |                     *	 d d d            n# 1 swxY w Y   ddlm} t          j        	                    |d          \  }}t          ||          D ]\  }} |||          }	|	                                }
|                     t          |
          d	           |                     |
d         t          j        |                                                     d S )
Nr   r.   r   r   r   r   )CythonLineSentencer   r/   )r   r   r   r   r   !gensim.models.word2vec_corpusfiler   r   r2   r   zipread_sentencerP   rQ   any2utf8strip)r   r   r5   r   r!   r   r   r   offsetlssentences              r   7test_cython_linesentence_readline_after_getting_offsetszHTestDoc2VecModel.test_cython_linesentence_readline_after_getting_offsets  s   GGG/00ZdV444 	4 4 4

5,T2233334	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	IHHHHH!(!^!^_cef!g!g// 	H 	HLFD##D&11B''))HS]]A...Xa[%.*F*FGGGG		H 	Hr   c                     t          j        t          d          d          }t          d          }	 |                    |ddd           dS # t
          $ r |                     d           Y dS w xY w)	z=Test storing document vectors of a model with unicode titles.T)r   r/   r0   r.   rI   zFailed storing unicode title.N)r   r2   r   r   rM   UnicodeEncodeErrorfailr   r6   r5   s      r   test_unicode_in_doctagz'TestDoc2VecModel.test_unicode_in_doctag,  s    4 @ @ @ANNN/00	7&&ttTX&YYYYY! 	7 	7 	7II5666666	7s   A A32A3c                 X   t          j        t          d          }t          d          }|                    |d           |                     |t           j                            |                     |                     |t           j                            |d                     dS )	rC   r/   r0   r.   r   )	sep_limitr)mmapN)r   r2   r   r   r3   rD   r4   r   s      r   test_load_mmapzTestDoc2VecModel.test_load_mmap5  s    	Q777/00 	

41
%%%%!5!5d!;!;<<< 	%!5!5d!5!E!EFFFFFr   c                    t                      }t          j        d          }|                    |           |                     t          |j        j                  d           |                     |j        d         j        d           |                     |j        t          j
        d                   j        d           |                     t          |j        d           dS ) Test doc2vec doctag alternativesr/   r0   ra   r   r^   _*0N)r   r   r2   r   rP   rQ   rS   re   rf   npint64r   KeyError__getitem__r   corpusr6   s      r   test_int_doctagsz!TestDoc2VecModel.test_int_doctagsA  s    !,,,&!!!UX-..444!*F333"(1++.4f===(E$5u=====r   c                     t          t          d                    }|dd         |z   }t          j        d          }|                    |           |                     t          |j        j        d           dS )r   Tr   
   r/   r0   	not_a_tagN)	r   r   r   r2   r   r   r   rS   r   r   s      r   test_missing_string_doctagz+TestDoc2VecModel.test_missing_string_doctagL  sr    mD))**"&!,,,&!!!(EH$8+FFFFFr   c                 :   t          t          d                    }|dd         |z   }t          j        d                              |           |                     t          j        j                  d           |                     j        d         j	        d           |                     j        d         j	        d           | 
                    t          j        d         j        d         k                         | 
                    t          j        j                                                  t          j        j                  k                |                     t          fd	j        j                                        D                       t          j        j                             |                     j        j        d         j                            j        d         g          d         d                    d
S )r   Tr   r   r/   r0   ra   r   r   c              3   L   K   | ]}j                             |          V  d S r   )rS   	get_index)r*   str_keyr6   s     r   r;   z7TestDoc2VecModel.test_string_doctags.<locals>.<genexpr>e  s3      XX""7++XXXXXXr   N)r   r   r   r2   r   rP   rQ   rS   re   rf   rd   allmaxkey_to_indexvaluesrg   
assertLesskeysr   r   s     @r   test_string_doctagsz$TestDoc2VecModel.test_string_doctagsV  s   mD))**"&!,,,&!!!UX-..444!*F333%.777EHUOux{:;;<<<EH188::;;c%(BW>X>XXYYYXXXX58;P;U;U;W;WXXXXX !!	
 	
 	

 	.q1583H3H%(ST+3W3WXY3Z[\3]^^^^^r   c                     |                      t          t          j        g            |                      t          t          j        t          d           d S )Ni'  r0   )r   RuntimeErrorr   r2   r   r   s    r   test_empty_errorsz"TestDoc2VecModel.test_empty_errorsk  sD    ,<<< 	,PUVVVVVr   c                 "   ddg}dg}t          t          d                    }t          j        d          }|                    |           |                     |                    ||          |                    ||          k               dS )z,Test similarity of out of training sentencesromeitalycarTr/   r0   N)r   r   r   r2   r   rd   similarity_unseen_docs)r   
rome_words	car_wordsr   r6   s        r   test_similarity_unseen_docsz,TestDoc2VecModel.test_similarity_unseen_docsr  s    g&
G	mD))**!,,,&!!!((Z@@**:yAAB	
 	
 	
 	
 	
r   Tc           
         dt          j        d          }d}|                    t          t	                                d         j                  }|j                            |gt          |j                            }d |D             }| 	                    |v d
                    |                     |                              }|                     |d           |j                            t          |j                            }	d |	D                                 |          }
|                     |
d	           |j                 }|j                            |gd
          }fd|D             }|	dd         }	|                     t          t          |	           d         t          t          |           d                    | 	                    t          j        t          t          |	           d         t          t          |           d                              |j                            t          |j                  dz  t          |j                  dz  dz            }d |D             }|D ]R}| 	                    t          |j                  dz  |cxk    ot          |j                  dz  dz  k    nc            S|                     |j                            |          |j                            |                     |                     |j                            ||          |j                            |                     |                     |j                            ||g          |           |rlt%          d          }|                    |           t(          j                            |          }|                    t0          |j        |j                   dS dS )zCAny non-trivial model on DocsLeeCorpus can pass these sanity checksr         r   c                     g | ]\  }}|S r&   r&   r*   docidsims      r   r,   z1TestDoc2VecModel.model_sanity.<locals>.<listcomp>  s    :::jeSE:::r   z{0} not found in {1}r   c                     g | ]\  }}|S r&   r&   r  s      r   r,   z1TestDoc2VecModel.model_sanity.<locals>.<listcomp>  s    000ZUC5000r      r   )positiver   c                 *    g | ]\  }}|k    ||fS r&   r&   )r*   idr  fire1s      r   r,   z1TestDoc2VecModel.model_sanity.<locals>.<listcomp>  s*    ???wr32;?"c???r   N   r/   r   r   )
clip_startclip_endc                     g | ]\  }}|S r&   r&   r  s      r   r,   z1TestDoc2VecModel.model_sanity.<locals>.<listcomp>  s    999u999r   zgensim_doc2vec_resave.tst)r   total_examplesepochs)r   r   r   r   r   r+   rS   r   rQ   rd   r   indexr   rP   r   allclose
similaritydoesnt_matchr   r3   r   r2   r4   r   r   corpus_countr  )r   r6   keep_trainingfire2alt1r   r   sims_idsf_ranksimsf2_rankdoc0_vecsims2	clip_simssims_doc_ids_idr5   loadedr  s                     @r   rn   zTestDoc2VecModel.model_sanity  s    **4+@+@+C+IJJ--}oCMM-RR::M:::)+A+H+HP]+^+^___&&### x$$UUX$??00400066u==$$$ 8E?%%z%CC????%???CRCyc4j))!,d3;.?.?.BCCCDd$4$4Q$7c5k9J9J19MNNOOO H!!%CMMQ4FQTUZU]Q^Q^abQbfgQg!hh 	99y999 	R 	RDOOCMMQ.$PPPP#eh--!:Kq:PPPPPQQQQ 	++E488%(:M:MeUZ:[:[\\\++E488%(:M:MeUZ:[:[\\\ 	..tU/CDDdKKK  	n:;;DJJt_))$//FLL6CV_e_lLmmmmm		n 	nr   c                    t                      }t          j        dddd          }|                    |           |                     |j        j        j        d           |                    ||j	        |j
                   |                     |           t          j        |dddd          }|                     ||           dS )	Test doc2vec training.r^   r   r  r/   rk   r1   r  workersr`   r  r  N)r   r   r2   r   rP   rS   re   rf   r   r  r  rn   rD   )r   r   r6   model2s       r   test_trainingzTestDoc2VecModel.test_training  s    C1RQRSSS&!!!)/<<<F5+=elSSS%    SAbZ[\\\%(((((r   c                    t          t          d                    5 }t          |           t          j        dddd          }|                    |           |                     |j        j        j	        d           |
                    ||j        |j        	           |                     |           t          j        |dddd
          }|                     |           ddd           dS # 1 swxY w Y   dS )r%  r.   r^   r   r  r/   r&  r=   r`   )r>   total_wordsr  )r>   rk   r1   r  r'  N)r   r   r?   r   r2   r   rP   rS   re   rf   r   ri   r  rn   r   r>   r6   s      r   test_training_fromfilez'TestDoc2VecModel.test_training_fromfile  s:   K(<==>> 	%+,[999OqUVWWWE+666UX-3Z@@@KKKU=U^c^jKkkke$$$OXYbdnopppEe$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   CC22C69C6c                 n    t          j        t          ddddd          }|                     |           dS )Test DBOW doc2vec training.r   r/   r   r  )dmhsnegativer1   r  Nr   r2   r   rn   r   s     r   test_dbow_hszTestDoc2VecModel.test_dbow_hs  s9    a!qY[\\\%     r   c           	          t          t          d                    5 }t          |           t          j        |ddddd          }|                     |           ddd           dS # 1 swxY w Y   dS )r0  r.   r   r/   r   r  )r>   r1  r2  r3  r1   r  Nr   r   r?   r   r2   rn   r-  s      r   test_dbow_hs_fromfilez&TestDoc2VecModel.test_dbow_hs_fromfile  s    K(<==>> 	%+,[999OaRS_`iklllEe$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   ?A))A-0A-c                 v    t          j        t          ddddddddd
  
        }|                     |           d	S )
Test DM/mean doc2vec training.r/   r   r   r   皙?r   r  	r1  dm_meanrk   windowr2  r3  alphar1   r  Nr4  r   s     r   test_dmm_hszTestDoc2VecModel.test_dmm_hs  sJ    Aqb1DAb
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	
  
        }|                     |           d
d
d
           d
S # 1 swxY w Y   d
S )r0  r.   r/   r   r   r   r;  r   r  r<  Nr   r   r?   r   r2   r   rn   r-  s      r   test_dmm_hs_fromfilez%TestDoc2VecModel.test_dmm_hs_fromfile  s    K(<==>> 	%+,[999O1"Qq"  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%   AA22A69A6c                 v    t          j        t          ddddddddd
  
        }|                     |           d	S )
Test DM/sum doc2vec training.r/   r   r   r   r;  r   r  r<  Nr4  r   s     r   test_dms_hszTestDoc2VecModel.test_dms_hs  sJ    Aqbqda
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	
  
        }|                     |           d
d
d
           d
S # 1 swxY w Y   d
S )r0  r.   r/   r   r   r   r;  r   r  r<  NrB  r-  s      r   test_dms_hs_fromfilez%TestDoc2VecModel.test_dms_hs_fromfile  s    K(<==>> 	%+,[999O1"Q1$!B  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%rD  c                 v    t          j        t          ddddddddd
  
        }|                     |           d	S )
%Test DM/concatenate doc2vec training.r/   r   r   r   r;  r   r  	r1  	dm_concatrk   r>  r2  r3  r?  r1   r  Nr4  r   s     r   test_dmc_hszTestDoc2VecModel.test_dmc_hs  sJ    A11DAb
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	
  
        }|                     |           d
d
d
           d
S # 1 swxY w Y   d
S )r0  r.   r/   r   r   r   r;  r   r  rL  NrB  r-  s      r   test_dmc_hs_fromfilez%TestDoc2VecModel.test_dmc_hs_fromfile  s    K(<==>> 	%+,[999OQBqq"  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%rD  c           	      p    t          j        t          dddddd          }|                     |           dS )r0     r   r   r   (   rk   r1  r2  r3  r1   r  Nr4  r   s     r   test_dbow_negzTestDoc2VecModel.test_dbow_neg  s=    aRS_`iklll%     r   c           
          t          t          d                    5 }t          |           t          j        t
          dddddd          }|                     |           ddd           dS # 1 swxY w Y   dS )	r0  r.   rR  r   r   r   rS  rT  NrB  r-  s      r   test_dbow_neg_fromfilez'TestDoc2VecModel.test_dbow_neg_fromfile  s    K(<==>> 	%+,[999OKRA!VWcdmopppEe$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   AA//A36A3c                 v    t          j        t          ddddddddd	
  
        }|                     |           d
S )r:  r/   r   r   r   r   r;  r   r  r<  Nr4  r   s     r   test_dmm_negzTestDoc2VecModel.test_dmm_neg  J    Aqbqtq
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	

  
        }|                     |           ddd           dS # 1 swxY w Y   dS )r0  r.   r/   r   r   r   r   r;  r   r  r<  NrB  r-  s      r   test_dmm_neg_fromfilez&TestDoc2VecModel.test_dmm_neg_fromfile"      K(<==>> 	%+,[999O1"Q141R  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%rD  c                 v    t          j        t          ddddddddd	
  
        }|                     |           d
S )rF  r/   r   r   r   r   r;  r   r  r<  Nr4  r   s     r   test_dms_negzTestDoc2VecModel.test_dms_neg,  rZ  r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	

  
        }|                     |           ddd           dS # 1 swxY w Y   dS )r0  r.   r/   r   r   r   r   r;  r   r  r<  NrB  r-  s      r   test_dms_neg_fromfilez&TestDoc2VecModel.test_dms_neg_fromfile4  r]  rD  c                 v    t          j        t          ddddddddd	
  
        }|                     |           d
S )rK  r/   r   r   r   r   r;  r   r  rL  Nr4  r   s     r   test_dmc_negzTestDoc2VecModel.test_dmc_neg>  sJ    A1tq
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        t
          ddddddddd	

  
        }|                     |           ddd           dS # 1 swxY w Y   dS )r0  r.   r/   r   r   r   r   r;  r   r  rL  NrB  r-  s      r   test_dmc_neg_fromfilez&TestDoc2VecModel.test_dmc_neg_fromfileF  s    K(<==>> 	%+,[999OQBqQ41R  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%rD  c                 x    t          j        t          dddddddddd	
          }|                     |           dS )z1Test DMM doc2vec training with fixed window size.r   r/   r   Fr   r   r;  r   r  )
rk   r1  r=  r>  shrink_windowsr2  r3  r?  r1   r  Nr4  r   s     r   test_dmm_fixedwindowsizez)TestDoc2VecModel.test_dmm_fixedwindowsizeP  sM    R!Ae2TQr
 
 

 	%     r   c                     t          t          d                    5 }t          |           t          j        |ddddddddd	d
          }|                     |           ddd           dS # 1 swxY w Y   dS )z<Test DMM doc2vec training with fixed window size, from file.r.   r   r/   r   Fr   r   r;  r   r  )r>   rk   r1  r=  r>  rg  r2  r3  r?  r1   r  Nr7  r-  s      r   !test_dmm_fixedwindowsize_fromfilez2TestDoc2VecModel.test_dmm_fixedwindowsize_fromfileY  s    K(<==>> 	%+,[999O'Ra%r2  E
 e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   AA..A25A2c           
      r    t          j        t          ddddddd          }|                     |           dS )	z2Test DBOW doc2vec training with fixed window size.rR  Fr   r   r   r  )rk   rg  r1  r2  r3  r1   r  Nr4  r   s     r   test_dbow_fixedwindowsizez*TestDoc2VecModel.test_dbow_fixedwindowsized  sF    RQa
 
 
 	%     r   c                     t          t          d                    5 }t          |           t          j        |ddddddd          }|                     |           d	d	d	           d	S # 1 swxY w Y   d	S )
z=Test DBOW doc2vec training with fixed window size, from file.r.   rR  Fr   r   r   r  )r>   rk   rg  r1  r2  r3  r1   r  Nr7  r-  s      r   "test_dbow_fixedwindowsize_fromfilez3TestDoc2VecModel.test_dbow_fixedwindowsize_fromfilel  s    K(<==>> 	%+,[999O'RQ!B  E e$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   AA++A/2A/c                     t          j        t                      d          }t          j        |dd          }|                     |           dS )z@Test doc2vec parallel training with more than default 3 threads.ip  r   i  )r'  batch_wordsN)r   RepeatCorpusr   r   r2   rn   r   s      r   test_parallelzTestDoc2VecModel.test_parallelv  sL     #MOOT:: tDDD%     r   c                     t          j        t                      dd          }t          j        t                      dd          }|                     ||           dS )7Test doc2vec results identical with identical RNG seed.*   r/   )seedr'  Nr   r2   r   rD   r   r6   r)  s      r   test_deterministic_hsz&TestDoc2VecModel.test_deterministic_hs  sT     b!DDDr1EEE%(((((r   c                     t          j        t                      dddd          }t          j        t                      dddd          }|                     ||           dS )rt  r   r   ru  r/   )r2  r3  rv  r'  Nrw  rx  s      r   test_deterministic_negz'TestDoc2VecModel.test_deterministic_neg  s^     ATUVVVQUVWWW%(((((r   c                     t          j        t                      dddddddd	  	        }t          j        t                      dddddddd	  	        }|                     ||           dS )rt  r/   r   r   r   ru  )r1  rM  rk   r>  r2  r3  rv  r'  Nrw  rx  s      r   test_deterministic_dmcz'TestDoc2VecModel.test_deterministic_dmc  s     OOQBQR
 
 
 OOQBQR
 
 
 	%(((((r   c                 H   d t          t                    D             }t          j                    }|                    |           t          t                    t          |j        j                  z   }| 	                    t          |j        j
                  |           dS )zAEnsure alternating int/string tags don't share indexes in vectorsc                 N    g | ]"\  }}t          j        |||d          g          #S )r   r(   r)   s      r   r,   z9TestDoc2VecModel.test_mixed_tag_types.<locals>.<listcomp>  s2    rrrXQPUG251eAh-HHrrrr   N)r   raw_sentencesr   r2   r   rQ   r   rS   r   rP   re   )r   mixed_tag_corpusr6   expected_lengths       r   test_mixed_tag_typesz%TestDoc2VecModel.test_mixed_tag_types  s    rrYbcpYqYqrrr!!*+++i..3ux/D+E+EEUX-..@@@@@r   c                    |                      t          |j                  t          |j                             |                     t	          j        |j        j        |j        j                             |j        r2|                     t	          j        |j        |j                             |j	        r2|                     t	          j        |j
        |j
                             |                      t          |j                  t          |j                             |                      t          |j        j                  t          |j        j                             d S r   )rP   rQ   rR   rd   r   r  re   r2  syn1r3  rj   rS   rg   rx  s      r   rD   zTestDoc2VecModel.models_equal  s   UXFI777EH$4fi6GHHIII8 	BOOBK
FK@@AAA> 	HOOBKv~FFGGGUXFI777UX233S9O5P5PQQQQQr   c                     t           j                            t          d                    }|d         }|                     t
                    5  |dz  }d d d            d S # 1 swxY w Y   d S )Nword2vec_pre_kv_csaysr   )r	   rN   rO   r
   r   
ValueError)r   r6   vectors      r   test_word_vec_non_writeablez,TestDoc2VecModel.test_word_vec_non_writeable  s    )>>xH[?\?\]]vz** 	 	aKF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   A""A&)A&c                     ddg}d t          |          D             }t          j                    }|                    |           d}|                     |t          |          v            dS )zFTest if logger warning is raised on non-ideal input to a doc2vec modelhumanmachinec                 @    g | ]\  }}t          j        ||g          S r&   r(   r)   s      r   r,   z=TestDoc2VecModel.test_build_vocab_warning.<locals>.<listcomp>  +    aaaHAuW+EA377aaar   zAEach 'words' should be a list of words (usually unicode strings).N)r   r   r2   r   rd   str)r   loglinesr  r   r6   warnings         r   test_build_vocab_warningz)TestDoc2VecModel.test_build_vocab_warning  ss     !),aa	R_H`H`aaa	!!)$$$U3x==011111r   c                    dgddgg}d t          |          D             }t          j        ddddd	          }|                    |           t	          d
          D ]V}|                    ||j        |j                   |xj        dz  c_        |j        |_	        |dk    r|xj        dz  c_        Wd}| 
                    |t          |          v            dS )zKTest if warning is raised if alpha rises during subsequent calls to train()r  graphtreesc                 @    g | ]\  }}t          j        ||g          S r&   r(   r)   s      r   r,   z7TestDoc2VecModel.test_train_warning.<locals>.<listcomp>  r  r   g?r/   r   r   )r?  	min_alphar1   r'  rk   r   r(  gMb`?r;  z6Effective 'alpha' higher than previous training cyclesN)r   r   r2   r   ranger   r  r  r?  r  rd   r  )r   r  r  r   r6   epochr  s          r   test_train_warningz#TestDoc2VecModel.test_train_warning  s     "!7+-aa	R_H`H`aaa	euSTbcddd)$$$2YY 	$ 	$EKK	%2DU\KZZZKK5 KK#kEOz $t#J3x==011111r   c                 F    |                      t          t                     dS )zBTest if exception is raised when loading doc2vec model on instanceN)r   AttributeErrorr7   r   s    r   test_load_on_class_errorz)TestDoc2VecModel.test_load_on_class_error  s    .*:;;;;;r   c                 >   t          j        t          ddd          }t          d          }|                    |           t           j                            |          }|                    t          |j        d           |j        dk    sJ |j                    dS )z@The model should accept a negative ns_exponent as a valid value.r/   )ns_exponentr1   r'  zd2v_negative_exp.tstr(  N)	r   r2   r   r   r3   r4   r   r  r  )r   r6   r5   r   s       r   test_negative_ns_expz%TestDoc2VecModel.test_negative_ns_exp  s    	rQPQRRR122

4++D119U5GPQRRR'2-GG|/GGGGGGr   NT)@r#   r$   r%   rE   rG   rX   rq   rt   ry   r}   r   r   r   rv   r   unittestskipIfosnamer   r   r   r   r   r   r   r   r   r   rn   r*  r.  r5  r8  r@  rC  rG  rI  rN  rP  rU  rW  rY  r\  r_  ra  rc  re  rh  rj  rl  rn  rr  ry  r{  r}  r  rD   r  r   r  r  r  r  r&   r   r   rA   rA   ?   s       = = =A A A> > >&! ! !&! ! !&1 1 11 1 11 1 11 1 11 1 1' ' '*	I 	I 	I X_RW_&JKK< < LK<B X_RW_&HII< < JI<BH H H"7 7 7
G 
G 
G	> 	> 	>G G G_ _ _*W W W
 
 
.n .n .n .n`) ) )% % %! ! !
% % %! ! !% % %! ! !% % %! ! !% % %! ! !
% % %! ! !% % %! ! !% % %! ! !% % %! ! !	% 	% 	%! ! !% % %! ! !) ) )) ) )) ) )A A A
R 
R 
R   []]2 2 ]2 []]2 2 ]2 < < <H H H H Hr   rA   r   c                 F    |                      ||k     |d|           d S )Nz is not less than )msg)rd   )r   abr  s       r   r   r     s-    Aqqq!!#DEEEEEr   c                   H    e Zd ZdZd Zd Zd Zed             Zd	dZ	d Z
dS )
ConcatenatedDoc2Vecz
    Concatenation of multiple models for reproducing the Paragraph Vectors paper.
    Models must have exactly-matching vocabulary and document IDs. (Models should
    be trained separately; this wrapper just returns concatenated results.)
    c                     || _         t          |d         d          r t          d |D                       | _        d S d S )Nr   rS   c                     g | ]	}|j         
S r&   )rS   r*   r6   s     r   r,   z0ConcatenatedDoc2Vec.__init__.<locals>.<listcomp>  s    *H*H*H58*H*H*Hr   )modelshasattrConcatenatedDocvecsrS   r   r  s     r   r   zConcatenatedDoc2Vec.__init__  sN    6!9d## 	J)*H*H*H*H*HIIDGGG	J 	Jr   c                 N    t          j        fd| j        D                       S )Nc                      g | ]
}|         S r&   r&   r*   r6   tokens     r   r,   z3ConcatenatedDoc2Vec.__getitem__.<locals>.<listcomp>      EEEuU|EEEr   r   concatenater  r   r  s    `r   r   zConcatenatedDoc2Vec.__getitem__  *    ~EEEEEEEFFFr   c                 J    d                     d | j        D                       S )z-Abbreviated name, built from submodels' names+c              3   4   K   | ]}t          |          V  d S r   )r  r  s     r   r;   z.ConcatenatedDoc2Vec.__str__.<locals>.<genexpr>  s(      <<uE

<<<<<<r   )joinr  r   s    r   __str__zConcatenatedDoc2Vec.__str__  s%    xx<<<<<<<<r   c                 &    | j         d         j        S Nr   )r  r  r   s    r   r  zConcatenatedDoc2Vec.epochs  s    {1~$$r   Nc                 Z    t          j        fd| j        D                       S )Nc                 @    g | ]}|                               S r&   )r   )r*   r6   r?  documentr  r  s     r   r,   z4ConcatenatedDoc2Vec.infer_vector.<locals>.<listcomp>  s.    oooZ_u11(E9fUUooor   r  )r   r  r?  r  r  s    ````r   r   z ConcatenatedDoc2Vec.infer_vector   s8    ~ooooooocgcnooopppr   c                     d S r   r&   )r   ignore_argsignore_kwargss      r   r   zConcatenatedDoc2Vec.train  s    r   )NNN)r#   r$   r%   __doc__r   r   r  propertyr  r   r   r&   r   r   r  r    s         J J J
G G G= = = % % X%q q q q    r   r  c                       e Zd Zd Zd ZdS )r  c                     || _         d S r   )r  r  s     r   r   zConcatenatedDocvecs.__init__  s    r   c                 N    t          j        fd| j        D                       S )Nc                      g | ]
}|         S r&   r&   r  s     r   r,   z3ConcatenatedDocvecs.__getitem__.<locals>.<listcomp>  r  r   r  r  s    `r   r   zConcatenatedDocvecs.__getitem__  r  r   N)r#   r$   r%   r   r   r&   r   r   r  r    s7          G G G G Gr   r  SentimentDocumentzwords tags split sentimentTc           	      .   t          j        d|            g d}d |D             }|                    g d           dg}i }t          t          j                            | d          d          5 }t          t          j                            | d          d          5 }t          |           t          |           t          ||          D ]\  }}	|	                    d	          \  }
}t          |
          }
|                                }|D ]\  }}|                    ||          }|		                    d
          \  }}|
t          |          k    sJ ||vr|
t          |          f||<   	 ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   dgdz  }t          t          j                            | d          d          5 }|D ]\}|	                    d          \  }}
|D ]\  }}|                    ||          }|                                |t          |
          <   ]	 ddd           n# 1 swxY w Y   t          dt          j        dz             }t          t          j                            | d          d          5 }t          |           |D ]}|	                    d          \  }
}t          |
          }
t!          |          }||
         }|	                                }|rd |D             }|                    |d          \  }}g d|         } |||
g|||          ||
<   	 ddd           n# 1 swxY w Y   t%          d |D                       t'          |          k    sJ t%          d |D                       dk    sJ t%          d |D                       dk    sJ t%          d |D                       dk    sJ t          j        dt'          |          t'          |          |            |S )a\  
    Read and return documents from the Stanford Sentiment Treebank
    corpus (Rotten Tomatoes reviews), from http://nlp.Stanford.edu/sentiment/

    Initialize the corpus from a given directory, where
    http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
    has been expanded. It's not too big, so compose entirely into memory.
    zloading corpus from %s)   à   á   â   ã   æ   ç   è   é   ír     ï   ñ   ó   ô   ö   û   üc                 b    g | ],}|                     d                               d          |f-S )zutf-8latin1)encodedecode)r*   chars     r   r,   z5read_su_sentiment_rotten_tomatoes.<locals>.<listcomp>"  s7    cccG,,33H==tDcccr   ))   Â      )z-LRB-()z-RRB-)r  zdatasetSentences.txtr   zdatasetSplit.txt	,Ni zdictionary.txt|SentimentPhrasesentence_idzsentiment_labels.txtc                 6    g | ]}|                                 S r&   )lower)r*   words     r   r,   z5read_su_sentiment_rotten_tomatoes.<locals>.<listcomp>T  s     888$888r   r  )Nr   r   devc              3   (   K   | ]}|j         	dV  d S )Nr/   r  r*   phrases     r   r;   z4read_su_sentiment_rotten_tomatoes.<locals>.<genexpr>Y  s*      IIV&*<IqIIIIIIr   c              3   0   K   | ]}|j         d k    dV  dS )r   r/   Nsplitr  s     r   r;   z4read_su_sentiment_rotten_tomatoes.<locals>.<genexpr>[  s.      BBV&,'*ABqBBBBBBr   iS!  c              3   0   K   | ]}|j         d k    dV  dS )r   r/   Nr  r  s     r   r;   z4read_su_sentiment_rotten_tomatoes.<locals>.<genexpr>\  s.      AAV&,&*@AqAAAAAAr   i  c              3   0   K   | ]}|j         d k    dV  dS )r  r/   Nr  r  s     r   r;   z4read_su_sentiment_rotten_tomatoes.<locals>.<genexpr>]  s.      @@V&,%*?@q@@@@@@r   iL  z6loaded corpus with %i sentences and %i phrases from %s)r   r   extendr   r  pathr  nextr   r  intrstripreplacer   r  _fieldsfloatgetsumrQ   )dirname	lowercasechars_sst_mangledsentence_fixupsphrase_fixupsinfo_by_sentencer   splitssentence_line
split_liner
  textjunkfixid2split_iphrasesphrase_linesr!   r  
sentiments	sentimentr+   r  r  s                            r   !read_su_sentiment_rotten_tomatoesr    s    L)7333   dcQbcccO       #OM
 	bgll7$:;;S	A	A @Y"',,w(:;;SAA 	@VOOOLLL-0F-C-C 	@ 	@)z(..t44DWW{{}}!0 3 3ID#<<c22DD!+!1!1#!6!6gSXX~%%%// @.0#g,,-?$T*	@	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@@ @ @ @ @ @ @ @ @ @ @ @ @ @ @  fvoG	bgll7$455s	;	; -|  	- 	-DCJT2* / /	c||D#..#{{}}GCGG		-- - - - - - - - - - - - - - - !!24E4MP`4`aaO	bgll7$:;;S	A	A VZZ 
	V 
	VD"jjooORRBi((I2;DJJLLE 988%888%5%9%9$	%J%J"['2227;E)/%"uiUUGBKK
	VV V V V V V V V V V V V V V V IIwIIIIISQaMbMbbbbbBBwBBBBBdJJJJAAwAAAAATIIII@@w@@@@@DHHHHL@s7||W  
 Ns]   0/FCE?3F?F	FF	FFFA II	IB+MMM__main__z)%(asctime)s : %(levelname)s : %(message)s)r   levelzgensim.test.test_doc2vec)moduler   r  ),r  
__future__r   r   r   r  r  collectionsr   numpyr   testfixturesr   gensimr   gensim.modelsr   r	   gensim.test.utilsr
   r   r   r   r  r   r   r   r   r   r7   r?   TestCaserA   r  r   setattrr  r  r  r  r#   basicConfigDEBUGmainr&   r   r   <module>r.     s    0 / / / / / / /   				 " " " " " "     $ $ $ $ $ $       / / / / / / / / b b b b b b b b b b b b\ \ \ \ \ \ \ \$ d==??## ZY		-@X@XYYY	  U U U\
H \
H \
H \
H \
Hx( \
H \
H \
H@ w.. 8F F F F GlJ777
       <G G G G G G G G J24PQQ R R R Rj z 5GJRYR_````HM34444445 5r   