o
    VA(f"                     @   s<  d Z ddlZddlZeddZeddZdd Zi d	d	d
d
dddddd	dd
dddddddddddddddddddd d!d"d#d$d%d&d'Zi d	dd
dddddddddddddddddd dd"dd#d(d$d(d%d)d&d)Zd*d+ Z	d,d- Z
d.d/ Zd0d1 Zd2d3 ZdS )4z>Tools for working with files in the samtools pileup -c format.    NPileupSubstitution)

chromosomeposreference_basegenotypeconsensus_qualitysnp_qualitymapping_qualitycoverage
read_basesbase_qualitiesPileupIndel)r   r   r   r   r   r   r	   r
   Zfirst_alleleZsecond_alleleZreads_firstZreads_secondZ
reads_diffc                 c   s    t dd t t ttttt t f
}t dd t t ttttt t tttf}| D ]K}|dd  }|d dkrPztdd	 t||D  V  W q" tyO   td
| w ztdd	 t||D  V  W q" tym   td
| w dS )a/  iterate over ``samtools pileup -c`` formatted file.

    *infile* can be any iterator over a lines.

    The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
    or :class:`pysam.Pileup.PileupIndel`.

    .. note::

       The parser converts to 0-based coordinates
    c                 S      t | d S N   intx r   ,lib/python3.10/site-packages/pysam/Pileup.py<lambda>0       ziterate.<locals>.<lambda>c                 S   r   r   r   r   r   r   r   r   2   r   N   *c                 S      g | ]\}}||qS r   r   .0r   yr   r   r   
<listcomp>9       ziterate.<locals>.<listcomp>zparsing error in line: `%s`c                 S   r   r   r   r   r   r   r   r    >   r!   )	strr   splitr   zip	TypeErrorpysamZSamtoolsErrorr   )infileZ
conv_substZ
conv_indellinedr   r   r   iterate#   s*   
r*   ACGTZAAZCCZGGZTTZUUUZAGrZGARZCTr   ZTCYZACmZCAMGTkZTGKsSwW)CGZGCATZTAr<   r=   c                 C   s   t |   S )zencode genotypes like GG, GA into a one-letter code.
    The returned code is lower case if code[0] < code[1], otherwise
    it is uppercase.
    )ENCODE_GENOTYPEuppercoder   r   r   encodeGenotype^   s   rB   c                 C   s   t |  S )z|decode single letter genotypes like m, M into two letters.
    This is the reverse operation to :meth:`encodeGenotype`.
    )DECODE_GENOTYPEr@   r   r   r   decodeGenotypef   s   rD   c           	   	      s   dd  dd  fdd}g g }}d}| D ]$}z	|||\}}W n
 t y-   Y  nw || |dkr<|| qd	}|rDt  tt|d
ksPJ d|d }d|}||fS )z*translate indel from vcf to pileup format.c                 S   sL   t t| t|}t|D ]}| | || kr| d|   S q| d| S )z'get common prefix of strings s1 and s2.Nminlenranges1s2nr   r   r   r   	getPrefixq   s   z0translateIndelGenotypeFromVCF.<locals>.getPrefixc                 S   sp   t t| t|}| d |d krdS td| d dD ]}| | || kr0| |d d   S q| | d S )z&get common sufix of strings s1 and s2.r    r   NrE   rI   r   r   r   	getSuffixy   s   z0translateIndelGenotypeFromVCF.<locals>.getSuffixc                    s  | |krdS t |t | krg|| r#d|t | d   t | d fS || r5d|d t |    dfS  || }|| }t |t | t |  }|dk rRt d|t |t ||    t |d fS t |t | k r| |rd| t |d   t |d fS | |rd| d t |  dfS  || }|| }t |t | t | }|dk rt d| t |t ||    t |fS J d)N)r   r   z-%sr   r   r   z+%szsnp?)rG   
startswithendswith
ValueError)variantrefprefixsuffixZsharedrM   rP   r   r   getGenotype   s2   
 


*
 


&z2translateIndelGenotypeFromVCF.<locals>.getGenotypeTr   Fr   zmultiple offsets for indelr   /)rS   appendrG   setjoin)	Zvcf_genotypesrU   rY   	genotypesZoffsetsZis_errorrT   goffsetr   rX   r   translateIndelGenotypeFromVCFm   s,   

)


ra   c                    s8  | j }| j}| j}|g| j  | | }|d }t|dkr%tdt|  |d }|d dkr1dS  fdd|D }|d	dgd  }}| jd
dgd }	|dd}
t|dksgt	dd | jD dkrt
||\}}t||| d||||	|
|dt| dddS td|}d}d}t|||||||	|
||
S )z$convert vcf record to pileup record.r5   r   z%only single genotype per position, %sr   .Nc                    s    g | ]}|d kr t | qS )rZ   r   r   r   Zallellesr   r   r       s     zvcf2pileup.<locals>.<listcomp>ZGQZMQZDPc                 S   s   g | ]}t |qS r   )rG   rc   r   r   r   r       s    r   <rN   )Zcontigr   rU   ZaltrG   rS   r"   getinfomaxra   r   rB   r]   r   )vcfsampler   r   Z	referencedatar^   r   r   r	   r
   r   r`   r   r   r   rd   r   
vcf2pileup   sP   $
rl   c                 c   sN    t  }||  || vrtd| D ]}t||}|r$|V  qdS )a  iterate over a vcf-formatted file.

    *infile* can be any iterator over a lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution` or
    :class:`pysam.Pileup.PileupIndel`.

    Positions without a snp will be skipped.

    This method is wasteful and written to support same legacy code
    that expects samtools pileup output.

    Better use the vcf parser directly.

    zsample %s not vcf fileN)r&   ZVCFZconnectZ
getsamplesKeyErrorZfetchrl   )r'   rj   ri   rowresultr   r   r   iterate_from_vcf   s   

rp   )__doc__collectionsr&   
namedtupler   r   r*   r>   rC   rB   rD   ra   rl   rp   r   r   r   r   <module>   s     		

Y: