
    c                     z   d Z ddlmZmZ ddlZddlZddlZddlZddlZddl	Z	ddl
ZddlZddlZ	 ddlmZ n# e$ r dZY nw xY wdZdgZddgZg dZd	Zd
 Zd Zd Z G d d          Zedk    rK ej        dej                    ej        dd                    ej                             ej                             ej        d                   Z! e"ej                  dk     r4 e# e$            d          e%            z              ej&        d           ej        d         Z'e'(                    d          r)ej)        *                     ej+        e'                    Z,nej)        *                    e'          Z, e"ej                  dk    r e-ej        d                   Z.ne,j/        Z. e"ej                  dk    r e-ej        d                   Z0ne,j1        Z0 ej        de.e0            ee,e.e0          Z2ej3        4                    e0          Z5 ej        d            ej6        e0e0fej7                  Z8ej3        9                    e2d          D ]hZ: e;d e:D                       Z<ej=        >                    e:e<e0 e"e:          ej?                  Z:e:e:j@        z  Z:e:A                                Z:e8e:z  Z8[:i ej        d            e8B                    ej?                  Z8ejC        D                    e8          \  ZEZFeEddd!         ZEeFj@        ddd!         j@        ZF ejG        e'd"z   eE           eD ]\ZH ejI        eFdddeHf          ejI         ejJ        eEdeH                   eFdddeHf         j@                             ZKeKe8z  ZKejC        L                    eK          ZM[KeEeHdz            ZN e#d#d$eHeMeNfz              e#d%d&'            ed(e8 ej6        e0eHf           ej6        eH          eMeN           er ej        d)eH            ej                    ZOej=        >                    e2e0*          ZP eePeH          \  ZQZRZS ej                    eOz
  ZO[P[SeQj@        B                    ej?                  eRB                    ej?                  dz  cZTZR[Q e#d+eHeOeRd         eRd!         fz              ed,e8eTeReMeN           [TeD ]ZUeD ]ZV ej        d-eHeUeV            ej                    ZOeUejW        jX        _Y        ejW        Z                    e2e5eHeVeU.          Z[ ej                    eOz
  ZOe[j\        jT        B                    ej?                  e[j\        jR        B                    ej?                  dz  cZTZR[[ e#d/eHeUeVeOeRd         eRd!         fz              ed0e8eTeReMeN           [T ej        d1eHeU            ej                    ZOejW        Z                    e2e5eHd2d	eU3          Z[ ej                    eOz
  ZOe[j\        jT        B                    ej?                  e[j\        jR        B                    ej?                  dz  cZTZR[[ e#d4eHeUeOeRd         eRd!         fz              ed5e8eTeReMeN           [T^ ej        d6e!           dS dS )7a  USAGE: %(program)s MATRIX.mm [CLIP_DOCS] [CLIP_TERMS]

Check truncated SVD error for the algo in gensim, using a given corpus. This script
runs the decomposition with several internal parameters (number of requested factors,
iterative chunk size) and reports error for each parameter combination.

The number of input documents is clipped to the first CLIP_DOCS. Similarly,
only the first CLIP_TERMS are considered (features with id >= CLIP_TERMS are
ignored, effectively restricting the vocabulary size). If you don't specify them,
the entire matrix will be used.

Example: ./svd_error.py ~/gensim/results/wiki_en_v10k.mm.bz2 100000 10000
    )print_functionwith_statementN)	sparsesvdi,  i'  i  )r               Fc                     t           rXt          j        dt          | j                             t
          j                            |                                           S t          j
        S )z3Spectral norm ("norm 2") of a symmetric matrix `a`.z&computing spectral norm of a %s matrix)COMPUTE_NORM2logginginfostrshapescipylinalgeigvalshmaxnpnan)as    5lib/python3.11/site-packages/gensim/test/svd_error.pynorm2r   3   sN     =s17||LLL|$$Q''++---v    c                     t          j        dt          j        | |                                           z  | j        z            S )Ng      ?)r   sqrtmultiplysumsize)diffs    r   rmser    <   s6    73T40044666BCCCr   c                 ~   t          j        |t          j        t          j        |          |j                             }||z  }t           j                            |          t          |          }}t          d| |||z  |||z  t          |          fz             t          j
                                         d S )NzF%s error: norm_frobenius=%f (/ideal=%g), norm2=%f (/ideal=%g), RMSE=%g)r   dotdiagTr   normr   printr    sysstdoutflush)	nameaatusideal_nfideal_n2errnfn2s	            r   print_errorr3   @   s    6!RVBGAJJ,,--
-C3JCY^^C  %**B	P	r2="b8mT#YY?	@   Jr   c                       e Zd Zd Zd ZdS )ClippedCorpusc                 2    || _         ||c| _        | _        d S N)corpusmax_docs	max_terms)selfr8   r9   r:   s       r   __init__zClippedCorpus.__init__L   s    (0)%t~~~r   c              #   p    K   t          j         j         j                  D ]} fd|D             V  d S )Nc                 4    g | ]\  }}|j         k     ||fS  )r:   ).0fwr;   s      r   
<listcomp>z*ClippedCorpus.__iter__.<locals>.<listcomp>R   s-    @@@daQ-?@Aq6@@@r   )	itertoolsislicer8   r9   )r;   docs   ` r   __iter__zClippedCorpus.__iter__P   sX      #DK?? 	A 	AC@@@@c@@@@@@@	A 	Ar   N)__name__
__module____qualname__r<   rG   r?   r   r   r5   r5   K   s7        < < <A A A A Ar   r5   __main__z)%(asctime)s : %(levelname)s : %(message)s)formatlevelz
running %s r   __doc__r   bz2   z"using %i documents and %i featureszcomputing corpus * corpus^T)dtypei  )	chunksizec              #   4   K   | ]}t          |          V  d S r7   )len)r@   rF   s     r   	<genexpr>rV   u   s(      003c#hh000000r   )num_nnz	num_termsnum_docsrR   z1computing full decomposition of corpus * corpus^tz.spectrum.npyz(****************************************z4%i factors, ideal error norm_frobenius=%f, norm_2=%fz****************************** )endbaselinez$computing SVDLIBC SVD for %i factors)rX   z8SVDLIBC SVD for %i factors took %s s (spectrum %f .. %f)SVDLIBCzKcomputing incremental SVD for %i factors, %i power iterations, chunksize %i)id2word
num_topicsrS   power_itersz_incremental SVD for %i factors, %i power iterations, chunksize %i took %s s (spectrum %f .. %f)zincremental SVDz;computing multipass SVD for %i factors, %i power iterationsi  )r_   r`   rS   onepassra   zOmultipass SVD for %i factors, %i power iterations took %s s (spectrum %f .. %f)zmultipass SVDzfinished running %s)]rO   
__future__r   r   r   osr'   timerP   rD   numpyr   scipy.linalgr   gensimr   ImportErrorFACTORS	CHUNKSIZEPOWER_ITERSr   r   r    r3   r5   rH   basicConfigINFOr   joinargvpathbasenameprogramrU   r&   globalslocalsexitfnameendswithcorporaMmCorpusBZ2FilemmintnrY   mrX   r8   utilsFakeDictr_   zerosfloat64r+   grouperchunkr   rW   matutils
corpus2cscfloat32r$   toarrayastyper   eigh
spectrum_s
spectrum_usavefactorsr"   r#   r0   r%   	ideal_fror/   taken
corpus_ramutr-   vtr,   ra   rS   modelslsimodelP2_EXTRA_ITERSLsiModelmodel
projectionr?   r   r   <module>r      s    6 5 5 5 5 5 5 5  				 



  



             #######   III 	%DM	oo   D D D  A A A A A A A A z m1GJRYR^____GLsxx11222gsx{++G
s38}}q ggii	"VVXX-...HQKE~~e ,^$$[S[%7%788^$$U++ s38}}q CK
s38}}q CLGL5q!<<<]2q!$$Fl##A&&GGL.///
"(Aq6
,
,
,C%%f%==  #00%00000**5'QY\Y\]bYcYckmku*vvuEGLDEEE
**RZ
 
 C"\..s33J
DDbD!Jddd#%JBGEO#Z000 = =rvjHWH-vrvgbgj'>R6S6SU_`a`a`ackdkck`kUlUn/o/oppps
INN3''	gk*hNRY[dfnQooppphBJBHa\**HBHg,?,?H	V 	V 	V 	GL?IIIDIKKE33Fa3HHJ!	*g66IB2DIKK%'E4;;rz**AHHRZ,@,@!,CDAqELeQqT1R512 3 3 3K	31iBBB& %	 %	K&  	a[)   "	8C&5..G'[ /   "	e+')00<<e>N>P>W>WXZXb>c>cef>f1Ak9eQqT1R5IJ  
 -sAq)XNNNAGLVX_almmmDIKKEM**Gt; +  E DIKK%'E#%,,RZ88%:J:L:S:STVT^:_:_ab:bDAqED+uadAbE:;  
 KaIxHHHK%	N GL&00000[m1 m1s   7 A A