
    cF                     (   d Z ddlmZ ddlmZ ddlZddlZddlZddlmZm	Z	m
Z
mZ ddlmZmZ  ej        e          Z G d de	j                  Z G d	 d
ej                  Zedk    r' ej        ej                    ej                     dS dS )z
This module replicates the miislita vector spaces from
"A Linear Algebra Approach to the Vector Space Model -- A Fast Track Tutorial"
by Dr. E. Garcia, admin@miislita.com

See http://www.miislita.com for further details.

    )division)with_statementN)utilscorporamodelssimilarities)datapathget_tmpfilec                   T    e Zd Z ed                                          Zd Zd ZdS )CorpusMiislitazfor a of the and to in onc              #      K   |                                  D ]F}d t          j        |                                                                          D             V  GdS )z
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.

        c                 .    g | ]}|t           j        v|S  )r   stoplist).0words     9lib/python3.11/site-packages/gensim/test/test_miislita.py
<listcomp>z,CorpusMiislita.get_texts.<locals>.<listcomp>(   s4     < < <D>#::<4 < < <    N)	getstreamr   
to_unicodelowersplit)selfdocs     r   	get_textszCorpusMiislita.get_texts   s}       >>## 	< 	<C< <E$4S$9$9$?$?$A$A$G$G$I$I < < < < < < <	< 	<r   c                     d| j         vrJt                              d           t          d |                                 D                       | _        | j        S )z'Define this so we can use `len(corpus)`lengthz5caching corpus size (calculating number of documents)c              3      K   | ]}d V  dS )   Nr   )r   _s     r   	<genexpr>z)CorpusMiislita.__len__.<locals>.<genexpr>/   s"      ::Aa::::::r   )__dict__loggerinfosumr   r   )r   s    r   __len__zCorpusMiislita.__len__+   sT    4=( 	;KKOPPP::)9)9:::::DK{r   N)__name__
__module____qualname__setr   r   r   r'   r   r   r   r   r      sN        s.446677H
< 
< 
<    r   r   c                        e Zd Zd Zd Zd ZdS )TestMiislitac                    t          t          d                    }t          d          }t          j                            ||           |                     t          j        	                    |                     t          j        |          }| 
                    t          |          t          |                     dS )z0Make sure TextCorpus can be serialized to disk. zhead500.noblanks.cor.bz2ztest_textcorpus.mmN)r   r	   r
   r   MmCorpussave_corpus
assertTrueospathexistsassertEquallist)r   miislitaftmp	miislita2s       r   test_textcorpuszTestMiislita.test_textcorpus4   s     "(+E"F"FGG /00$$T8444t,,--- $T**	hi99999r   c                 r   t          d          }t          |          }t          d          }|                    |           t                              |          }|                     t          |          t          |                     |                     |j        j        |j        j                   dS )z
        Make sure we can save and load (un/pickle) TextCorpus objects (as long
        as the underlying input isn't a file-like object; we cannot pickle those).
        miIslita.corztc_test.cpickleN)	r	   r   r
   saveloadr5   len
dictionarytoken2id)r   
corpusnamer7   tmpfr9   s        r   test_save_load_abilityz#TestMiislita.test_save_load_abilityB   s     n--
!*-- ,--d"''--	XI777,5y7K7TUUUUUr   c                    t          t          d                    }t          j        ||j        d          }t          j        ||         t          |j                            }d}|j                            |	                                
                                          }||         }||         }g d}t          |          D ]"\  }	}
|                     ||	         |
d           #d S )Nr<   F)	normalize)num_featureszlatent semantic indexing)g        gMb?g	h"lx?gׁ?g-!lV?   )r   r	   r   
TfidfModelr@   r   SparseMatrixSimilarityr?   doc2bowr   r   	enumerateassertAlmostEqual)r   r7   tfidfindexqueryvec_bow	vec_tfidf
sims_tfidfexpectedivalues              r   test_miislita_high_levelz%TestMiislita.test_miislita_high_levelT   s    !(>":":;; !(H,?5QQQ3E(ORUV^ViRjRjkkk +%--ekkmm.A.A.C.CDD'N	 9%
 988!(++ 	< 	<HAu"":a=%;;;;	< 	<r   N)r(   r)   r*   r:   rD   rW   r   r   r   r-   r-   3   sD        : : :V V V$< < < < <r   r-   __main__)level)__doc__
__future__r   r   loggingr2   unittestgensimr   r   r   r   gensim.test.utilsr	   r
   	getLoggerr(   r$   
TextCorpusr   TestCaser-   basicConfigDEBUGmainr   r   r   <module>rf      s;           % % % % % %  				  7 7 7 7 7 7 7 7 7 7 7 7 3 3 3 3 3 3 3 3		8	$	$    W'   .4< 4< 4< 4< 4<8$ 4< 4< 4<n z Ggm,,,,HMOOOOO r   