
    c                        d Z ddlZddlZddlZddlmZmZmZm	Z	 ddl
mZ dZedk    rej                            ej        d                   Z ej        e          Z ej        d           ej                            ej        	           e                    d
d                    ej                              eej                  dk     r4 e e            d          e            z              ej        d           ej        dd         \  ZZ ej        !                    ej        "                    e                     s e#d           eej                  dk    r e$ej        d                   Z%neZ%dev Z&dev Z'devZ(e&r ee%e(          Z)de)_*         e	ee'e)          Z+ ej,        e dz   e+d           e)-                    dde           e).                    e dz              e+/                    e dz              de)_*        np e	ee'          Z+e+j)        -                    dde            ej,        e dz   e+d           e+j)        .                    e dz               ej0        e dz             Z)[+ ee dz             Z1 ee1e)d           Z2e2/                    e d!z               ej,        e d"z   e2e1         d           e                    d#e           dS dS )$a  
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.

This actually creates three files:

* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
  Matrix Matrix format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump

The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.

`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to
100,000.

If you have the `pattern` package installed, this script will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .

Example:
  python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
    N)
DictionaryHashDictionaryMmCorpus
WikiCorpus)
TfidfModeli __main__z)%(asctime)s : %(levelname)s : %(message)s)format)levelz
running %s    __doc__   zOError: The output directory does not exist. Create the directory and try again.onlinelemmanodebug)id_rangedebugT)	lemmatize
dictionaryz_bow.mmi'  )progress_cnt   g?)no_belowno_abovekeep_nz_wordids.txt.bz2z_corpus.pkl.bz2F)r   )id2word	normalizez.tfidf_modelz	_tfidf.mmzfinished running %s)3r   loggingos.pathossysgensim.corporar   r   r   r   gensim.modelsr   DEFAULT_DICT_SIZE__name__pathbasenameargvprogram	getLoggerloggerbasicConfigrootsetLevelINFOinfojoinlenprintglobalslocalsexitinpoutpisdirdirname
SystemExitint
keep_wordsr   r   r   r   allow_updatewiki	serializefilter_extremessave_as_textsaveload_from_textmmtfidf     Glib/python3.11/site-packages/gensim/scripts/make_wiki_online_nodebug.py<module>rI      s   <   



 K K K K K K K K K K K K $ $ $ $ $ $   z ;0gsx{++GWw''FGJKKKKL---
KKchhsx00111 s38}}q ggii	"VVXX-...1IC7==..// ljjkkk
s38}}q 'S!%%

&
 F7"IW$E J#^ZuEEE
"&
z#zJJJ4)+TFFFF""BEV"WWW'9 9:::		$**+++"'
z#333''cJ['\\\4)+TFFFF$$T,>%>??? /Z.t6H/HII
 
$"	#	#B Jr:>>>E	JJtn$%%% Htk)595IIII
KK%w/////w;0 ;0rG   