o
    NrfZ2                     @   s   d dl Z d dlZd dlZd dlZd dlm  mZ d dl	m
Z
mZ d dlmZ dd Z	dddZdd	 Zd
d Zdd Z				dddZdddZ				dddZdS )    N)est_query_size_ixload_bai_index)abs2genomicc           	      C   s  g }d}| j }| j}| j| j }|D ]x}|d tjkr.||| d|d f ||d 7 }q|d tjkrB||| d|d f q|d tjkr\||| d|d f ||d 7 }q|d tjkrv||| d|d f ||d 7 }q|d tj	ks|d tj
kr||d 7 }qt|r|d }|d }|d tjkr|||d  d|d f |d tjkr|||d  d	|d f |d tjkr|||d  d|d f |d tjkr||d	|d f |S )
Nr   X   IDNSH)cigartuplesposquery_lengthpysamZCDIFFappendZCINSZCDELZ	CREF_SKIPZCEQUALZCMATCHlenZ
CSOFT_CLIPZ
CHARD_CLIP)	readsubscurr_posr   Z	readstartZreadendZctupleZfirst_ctupleZlast_ctuple r   Z/var/www/html/software/conda/envs/catlas/lib/python3.10/site-packages/clodius/tiles/bam.pyget_cigar_substitutions   s@   r   c              
      s  |durg }|  D ]\}}||t|gg7 }q
nt| j}	t| j}
t|	}	tt	|	dd |
D }dd |D }
tj
dt|
f }i dg dg dg d	g d
g dg dg dg dg dg dg dg dg dg dg dg }ddd}t|}d}t|
||D ]\} |t||  7 }qd}||krddiS t|
||D ]6\} t|| }|t|krq|| d  }| | }|D ]jrqzd}jrjrd}jrd}j| }|d  |g7  < |d  tj| g7  < |d  tj| g7  < |d
  jg7  < |d  |g7  < |d  jg7  < |d  jg7  < jdk}|rCt||}nd}|sjr~z fd djd!d!d"D }W n tyj   g }Y nw |rtt ||| |d  |g7  < n|d  g 7  < n	|d  |g7  < |d  t!g7  < t"j#}|d  |$d#dg7  < |d  |j% g7  < W n    z|d	  &d$g7  < W q t'y   |d	  dg7  < Y qw q|S )%a  
    Sample reads from the specified region, assuming that the chromosomes
    are ordered in some fashion. Returns an list of pysam reads

    Parameters:
    -----------
    samfile: pysam.AlignmentFile
        A pysam entry into an indexed bam file
    start_pos: int
        The start position of the sampled region
    end_pos: int
        The end position of the sampled region
    chromsize: pandas.Series
        A listing of chromosome sizes. If not provided, the chromosome
        list will be extracted from the the bam file header
    cache:
        An object that implements the `get`, `set` and `exists` methods
        for caching data

    Returns
    -------
    reads: [read1, read2...]
        The list of in the sampled regions
    Nc                 S      g | ]}t |qS r   int.0lengthr   r   r   
<listcomp>^       zload_reads.<locals>.<listcomp>c                 S      g | ]}|d  qS r   r   r   rr   r   r   r   `   r    r   idfromtomdZchrNameZ	chrOffsetZcigarZm1FromZm1ToZm2FromZm2Tomapqztags.HPZstrandvariantsZcigars-+)TFg    NAerrorz,Tile encompasses too much data: {total_size} Z_1Z_2i@  c                    s\   g | ]*}|d    kr kr,n n|d dur|d   r|d |d  j|d  fqS )r      Nr   )islowerquery_sequencer#   endr   startr   r   r      s    
T)Zwith_seqZmatches_onlyZHPZMD)(itemsr   nparray
referenceslengthsctbw	natsortedlistzipZr_Zcumsumr   r   r   r   fetchZis_unmappedZ	is_pairedZis_read1Zis_read2Z
query_nameZreference_startZreference_endZreference_nameZcigarstringr)   r   get_cached_variantsr1   Zget_aligned_pairs
ValueErrorset_cached_variantsr   dicttagsgetZ
is_reverseZget_tagKeyError)samfile	start_posend_pos
chromsizesindex_filenamecachechromsizes_listchromsizer8   r9   Zabs_chrom_offsetsresultsZstrandsidx
total_sizeZcidZMAX_SIZEZ
chr_offsetZseq_nameZreadsZ	id_suffixread_idZ	use_cacher*   rC   r   r2   r   
load_reads4   s   
	





	
YrS   c                 C   s,   d| }| r|  |rt| |S dS )zTry to get variants from a read we've seen before.

    This is useful for ONT reads where there's many variants
    per read and retrieving them takes a while.
    	variants.N)existsjsonloadsrD   )rK   rR   cache_idr   r   r   r?      s   
r?   c                 C   s(   d| }| r|  |t| dS dS )z$Save a set of variants to the cache.rT   N)setrV   dumps)rK   rR   r*   rX   r   r   r   rA      s   
rA   c                    s   |dur"g }|  D ]\}}||t|gg7 }q
tdd |D }n2t| j}t| j}t| j}tt|| t	
|} fdd|D }tt|dd |D }d}tt|| td }	d}
d	g|g|d|	  |||	|
d
}|S )at  
    Get the tileset info for a bam file

    Parameters
    ----------
    tileset: tilesets.models.Tileset object
        The tileset that the tile ids should be retrieved from

    Returns
    -------
    tileset_info: {'min_pos': [],
                    'max_pos': [],
                    'tile_size': 1024,
                    'max_zoom': 7
                    }
    Nc                 S   r!   r"   r   )r   cr   r   r   r     r    z*alignment_tileset_info.<locals>.<listcomp>c                    s   g | ]} | qS r   r   r#   Zref_lengthsr   r   r     r    c                 S   r   r   r   r   r   r   r   r     r       r/   i r   )Zmin_posZmax_pos	max_width	tile_sizerI   max_zoommax_tile_width)r5   r   sumr9   r6   r7   r8   rB   r=   r:   r;   r<   mathceillog)rF   rI   rL   rM   rN   total_lengthr8   r9   r_   r`   ZMAX_TILE_WIDTHtileset_infor   r\   r   alignment_tileset_info   s0   



rh   c              	   C   s   g }t | |}|D ]R}|dd d}	ttt|	dd }
|d dt|
d   }|r>||kr>|dd	| ifg  S t|
d | }|| }t| |||||d
}|||fg7 }q	|S )a  
    Generate tiles from a bigwig file.

    Parameters
    ----------
    tileset: tilesets.models.Tileset object
        The tileset that the tile ids should be retrieved from
    tile_ids: [str,...]
        A list of tile_ids (e.g. xyx.0.0) identifying the tiles
        to be retrieved
    index_filename: str
        The name of the file containing the index
    max_tile_width: int
        How wide can each tile be before we return no data. This
        can be used to limit the amount of data returned.
    cache:
        An object that implements the `get`, `set` and `exists` methods
        for caching data
    Returns
    -------
    tile_list: [(tile_id, tile_data),...]
        A list of tile_id, tile_data tuples
    |r   .r      r^   r/   r-   z1Tile too large, no data returned. Max tile size: )rG   rH   rI   rJ   rK   )rh   splitr<   mapr   rS   )rF   tile_idsrJ   rI   ra   rK   Zgenerated_tilesZtsinfoZtile_idZtile_id_partsZtile_positionZ
tile_widthrG   rH   Z
tile_valuer   r   r   alignment_tiles3  s0   

	ro   c                 C   s   t | }t||S N)r   AlignmentFilerh   )filenamerI   rF   r   r   r   rg   v  s   

rg   c                 C   s0   |s|  d}t j| |d}t||||d |dS )Nz.bai)rJ   )rJ   rI   ra   rK   )r   rq   ro   )rr   rn   rJ   rI   ra   rK   rF   r   r   r   tiles|  s   
rs   )NNN)NNNNrp   )rV   rc   numpyr6   r   Zclodius.tiles.bigwigrs   Zbigwigr:   Zclodius.tiles.tabixr   r   Zclodius.tiles.utilsr   r   rS   r?   rA   rh   ro   rg   r   r   r   r   <module>   s2    )
 5:

C	