o
    Nrf<@                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
mZ d dlmZ dd ZdddZdd	 Zd
d ZdddZdddZdd Zdd ZG dd deZedkrZe  dS dS )    N)Fastawrap_sequence
FetchError
ucsc_split	bed_splitget_valid_filename)defaultdictc                    s  t j j\}}|r|dd  }t jj} jr! fdd}t	 j j
t j j  j| j j d}t \}}|sD| }d}|D ]}||\}	}
}|	d krVqH jr{|
d urf|d urf||
 }nt||	 } jd |ksz jd |k r{qH jrddd	 |	|
||fD }t|}t|d
}n
 jr j}ntj}z/ jr|s jdkr|d d}|t ||	|
| nt ||	|
|D ]}|| qW n t y } zt t!|d d }~ww  jr|"  qH|#  d S )N   c                    s   t  j|  S N)recompileregexsearch)xargs T/var/www/html/software/conda/envs/catlas/lib/python3.10/site-packages/pyfaidx/cli.py<lambda>   s    z write_sequence.<locals>.<lambda>)default_seqZkey_functionZstrict_bounds
split_charfilt_functionZread_long_namesZrebuildFr   .c                 s   s    | ]	}|rt |V  qd S r
   )str).0er   r   r   	<genexpr>'   s    z!write_sequence.<locals>.<genexpr>w
nucleotidez name	start	end	A	T	C	G	N	others
Tz Try setting --lazy.
)$ospathsplitextfastar   r   r   r   Zinvert_matchr   r   evalZheader_functionZlazy	delimiterZ
long_namesZ
no_rebuildsplit_regionskeysZ
size_rangelenZsplit_filesjoinr   openoutsysstdout	transformwritetransform_sequencefetch_sequencer   r   close__exit__)r   _extr   r"   regions_to_fetchsplit_functionheaderregionnamestartendZsequence_lenfilenameoutfileliner   r   r   r   write_sequence	   sZ   .

r?   c              	   c   s   z1|j j| j}| jr)||kr)|d ur)|d ur)|| |d |d  }|jj}n|| || }W n tyI   tj	dj
di t  Y d S w | jrP|j}| jrV|j}| jr[d S | jr_n|sc|rq| jsqdd|jdgV  n
dd|jdgV  t||jD ]}|V  qd S )Nr	   z"warning: {name} not found in file
 >
r   )faidxindexlencauto_strandreverse
complementKeyErrorr+   stderrr.   formatlocals	no_outputZno_namesZ	no_coordsr(   Z
fancy_namer9   r   seq)r   r"   r9   r:   r;   line_lensequencer>   r   r   r   r0   >   s4   
r0   c           	      C   s   t | jd| jd}t| \}}|D ]I}||\}}}| jrG|r&|r&|| }n|s1|s1t|| }n
t|| || }|| j || ||< q| jrZ|| ||  || ||< qd S )NT)mutabler   )	r   r"   r$   r%   mask_with_default_seqr'   r   mask_by_caseZ	lowercase)	r   r"   r5   r6   r8   rnamer:   r;   spanr   r   r   mask_sequence[   s   
 rV   c                 C   s*   | j r| j }t}||fS | j}t}||fS r
   )bedr   regionsr   )r   r5   r6   r   r   r   r%   n   s   r%   c                    sZ  |j j| j}|| || }| jr|j}| jr|j}| jr d S | jdkr2dj|j|j	d |j
dS | jdkrAdj|jt|dS | jdkrt|  tt}| fd	d
t D  |dd}|dd}	|dd}
|dd}|dd}ddd
 | D }djd|j|j	|j
dt S | jdkrdj|j|j	|j
t|dS d S )NrW   z{name}	{start}	{end}
r	   )r9   r:   r;   
chromsizesz{name}	{length}
)r9   lengthr   c                    s   g | ]	}|  |fqS r   )count)r   cssr   r   
<listcomp>   s    z&transform_sequence.<locals>.<listcomp>Ar   TCGN|c                 S   s"   g | ]\}}d  |t|fqS ):)r(   r   )r   kvr   r   r   r_      s   " z5{sname}	{sstart}	{send}	{A}	{T}	{C}	{G}	{N}	{others}
)ZsnameZsstartsend
transposedz{name}	{start}	{end}	{seq}
)r9   r:   r;   rN   r   )rC   rD   rE   rH   rG   rM   r-   rK   r9   r:   r;   r'   r   upperr   intupdatesetpopr(   itemsrL   )r   r"   r9   r:   r;   rO   sZnucsr`   ra   rb   rc   rd   othersr   r]   r   r/   x   s4   


 
r/   c           
      C   s  ddl m} tjddd}|jdtdd |jd	td
dd |d}|d}|d}|jddtddd |jddtddd |jddtddd |jdddd d!d" |jd#d$dd d%d" |jd&d'dd d(d" |jd)d*td d+d, |	 }|jd-d.dd d/d" |jd0d1dd d2d" |jd3d4dd d5d" |jd6d7dd d8d" |jd9d:dd d;d" |jd<d=t
d d>d, |jd?d@td dAd, |jdBdCtdDdEd, |jdFdGtdHdIdJdK |dL}|jdMdNtdOdPd, |jdQdRdd dSd" |	 }|jdTdUdd dVd" |jdWdXdd dYd" |jdZdd d[d" |jd\dd d]d" |jd^d_|d`da ttjdbkr7| s7|  tdb n| r@|| }	n| }	|	jr\|	jrRtjdc |	jr\tjdd |	jsd|	jrjt|	 d S t|	 d S )eNr   )__version__zFetch sequences from FASTA. If no regions are specified, all entries in the input file are returned. Input FASTA file must be consistently line-wrapped, and line wrapping of output is based on input line lengths.zPlease cite: Shirley MD, Ma Z, Pedersen BS, Wheelan SJ. (2015) Efficient "pythonic" access to FASTA files using pyfaidx. PeerJ PrePrints 3:e1196 https://dx.doi.org/10.7287/peerj.preprints.970v1)descriptionepilogr"   z
FASTA file)typehelprX   *z=space separated regions of sequence to fetch e.g. chr1:1-1000)rv   nargsrw   zinput optionszoutput optionszheader optionsz-bz--bedrz1bed file of regions (zero-based start coordinate)z-oz--outr   z"output file name (default: stdout)z-iz--transform)rW   rY   r   rj   zItransform the requested regions into another format. default: %(default)s)rv   choicesrw   z-cz--complement
store_trueFz-complement the sequence. default: %(default)s)actiondefaultrw   z-rz	--reversez*reverse the sequence. default: %(default)sz-yz--auto-strandzQreverse complement the sequence when start > end coordinate. default: %(default)sz-az--size-rangezZselected sequences are in the size range [low, high]. example: 1,1000 default: %(default)s)rv   r~   rw   z-nz
--no-namesz5omit sequence names from output. default: %(default)sz-fz--long-nameszpoutput full (long) names from the input fasta headers. default: headers are truncated after the first whitespacez-tz--no-coordszOomit coordinates (e.g. chr:start-end) from output headers. default: %(default)sz-xz--split-fileszEwrite each region to a separate file (names are derived from regions)z-lz--lazyz>fill in --default-seq for missing ranges. default: %(default)sz-sz--default-seqzDdefault base for missing positions and masking. default: %(default)sz-dz--delimiterzjdelimiter for splitting names to multiple values (duplicate names will be discarded). default: %(default)sz-ez--header-functionzlambda x: x.split()[0]z]python function to modify header lines e.g: "lambda x: x.split("|")[0]". default: %(default)sz-uz--duplicates-actionstop)r   firstlastlongestZshortestzQentry to take when duplicate sequence names are encountered. default: %(default)s)rv   r~   r{   rw   zmatching argumentsz-gz--regexz.*zNselected sequences are those matching regular expression. default: %(default)sz-vz--invert-matchzRselected sequences are those not matching 'regions' argument. default: %(default)sz-mz--mask-with-default-seqz<mask the FASTA file using --default-seq default: %(default)sz-Mz--mask-by-casezBmask the FASTA file by changing to lowercase. default: %(default)sz--no-outputz0do not output any sequence. default: %(default)sz--no-rebuildzMdo not rebuild the .fai index even if it is out of date. default: %(default)sz	--versionversionzprint pyfaidx version number)r}   r   rw   r	   zQ--auto-strand and --complement are both set. Are you sure this is what you want?
zN--auto-strand and --reverse are both set. Are you sure this is what you want?
)pyfaidxrs   argparseArgumentParseradd_argumentr   add_argument_groupFileTypeparse_size_rangeadd_mutually_exclusive_groupcheck_seq_lengthr'   r+   argv
print_helpexit
parse_argsrF   rH   rJ   r.   rG   rR   rS   rV   r?   )
Zext_argsrs   parser_inputoutputr7   namesZmatcherZmaskingr   r   r   r   main   sb   



r   c                 C   s(   | d u r	 | S t | dkrtd| S )Nr	   z/--default-seq value must be a single character!)r'   r   ArgumentTypeError)valuer   r   r   r      s   
r   c              
   C   sX   | du r| S z|  dd ddd\}}W n tttfy#   tw t|t|fS )zK Size range argument should be in the form start,end and is end-inclusive. N r@   	,)replacesplit	TypeError
ValueError
IndexErrorrl   )r   r:   r;   r   r   r   r      s   "r   c                   @   s   e Zd ZdZdddZdd ZdddZd	d
 ZedddZ	dddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )CounterzDict subclass for counting hashable objects.  Sometimes called a bag
    or multiset.  Elements are stored as dictionary keys and their counts
    are stored as dictionary values.
    Nc                 K   s   | j |fi | dS )zCreate a new, empty Counter object.  And if given, count elements
        from an input iterable.  Or, initialize the count from another mapping
        of elements to their counts.
        N)rm   )selfiterablekwdsr   r   r   __init__   s   zCounter.__init__c                 C   s   dS )Nr   r   )r   keyr   r   r   __missing__   s   zCounter.__missing__c                 C   s4   |du rt |  tdddS t||  tddS )zList the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.
        Nr	   T)r   rG   )r   )sorted	iteritems
itemgetternlargest)r   nr   r   r   most_common   s   zCounter.most_commonc                 c   s.    |   D ]\}}td|D ]}|V  qqdS )zIterator over elements repeating each as many times as its count.

        If an element's count has been set to zero or is a negative number,
        elements() will ignore it.

        N)r   repeat)r   elemr[   r3   r   r   r   elements   s   zCounter.elementsc                 C   s   t d)Nz@Counter.fromkeys() is undefined.  Use Counter(iterable) instead.)NotImplementedError)clsr   rh   r   r   r   fromkeys  s   zCounter.fromkeysc                 K   s   |dur9t |dr(| r!| j}| D ]\}}||d| | |< qnt| | n| j}|D ]}||dd | |< q-|rB| | dS dS )zLike dict.update() but add counts instead of replacing them.

        Source can be an iterable, a dictionary, or another Counter instance.

        Nr   r   r	   )hasattrgetr   dictrm   )r   r   r   self_getr   r[   r   r   r   rm     s   
zCounter.updatec                 C   s   t | S )zBLike dict.copy() but returns a Counter instance instead of a dict.)r   )r   r   r   r   copy   s   zCounter.copyc                 C   s   || v rt | | dS dS )zGLike dict.__delitem__() but does not raise KeyError for missing values.N)r   __delitem__)r   r   r   r   r   r   $  s   zCounter.__delitem__c                 C   s6   | sd| j j S dtdj|  }d| j j|f S )Nz%s()z, z%r: %rz%s({%s}))	__class____name__r(   map__mod__r   )r   rp   r   r   r   __repr__)  s   zCounter.__repr__c                 C   sN   t |tstS t }t| t|B D ]}| | ||  }|dkr$|||< q|S )z'Add counts from two counters.

        r   
isinstancer   NotImplementedrn   r   otherresultr   newcountr   r   r   __add__8     
zCounter.__add__c                 C   sN   t |tstS t }t| t|B D ]}| | ||  }|dkr$|||< q|S )zF Subtract count, but keep only results with positive counts.

        r   r   r   r   r   r   __sub__E  r   zCounter.__sub__c                 C   sT   t |tstS t}t }t| t|B D ]}|| | || }|dkr'|||< q|S )zHUnion is the maximum of value in either of the input counters.

        r   )r   r   r   maxrn   )r   r   _maxr   r   r   r   r   r   __or__R  s   
zCounter.__or__c                 C   sj   t |tstS t}t }t| t|k r|| } }t| j|D ]}|| | || }|dkr2|||< q|S )z? Intersection is the minimum of corresponding counts.

        r   )r   r   r   minr'   filter__contains__)r   r   _minr   r   r   r   r   r   __and__`  s   

zCounter.__and__r
   )r   
__module____qualname____doc__r   r   r   r   classmethodr   rm   r   r   r   r   r   r   r   r   r   r   r   r      s     


r   __main__)NNr
   )r   r+   Zos.pathr   r   r   r   r   r   r   r   r   collectionsr   r?   r0   rV   r%   r/   r   r   r   r   r   r   r   r   r   r   <module>   s&    
5


9 
