o
    Nrf7<                     @   s  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZ	d dl
m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ ddlmZ dHddZdd Ze d	d
 Zdd Ze  ej!ddddej"dddddej"ddde#e$ dej"ddde%dej"ddde%dej"dd d!e%dej"d"d#d$e%dej"d%d&d'd(ej"d)d*d+d(ej"d,d-e&dd.ej"d/d0dd(ej"d1d2dd(ej"d3d4dd(ej"d5e#g d6d7d8d9ej"d:d;dd(ej"d<d=d>d?dej"d@dAe#dBdCgdBd.dDdE Z'e  ej!ddddej"dddddej"ddde#e$ dej"d/d0dd(ej"d:d;dd(ej"d<d=d>d?ddFdG Z(dS )I    N)tqdm   )clic           
      C   sx   | d }|  d}|d dd }tt|dd d}d	d
 |D }|d }t|d }t|d }	|||	|fS )aM  
    Convert a line from an epilogos bedfile to vector format.

    Parameters
    -----------
    bedline: [string,....]
        A line from a bedfile broken up into its constituent parts
        (e.g. ["chr1", "1000", "2000", "[1,2,34,5]"])

    Returns
    -------
    An array containing the values associated with that line
    r   	   :c                 S   s   | d S )Nr    xr	   r	   \/var/www/html/software/conda/envs/catlas/lib/python3.10/site-packages/clodius/cli/convert.py<lambda>)   s    z,epilogos_bedline_to_vector.<locals>.<lambda>)keyc                 S   s   g | ]}|d  qS )r   r	   ).0vr	   r	   r   
<listcomp>*   s    z.epilogos_bedline_to_vector.<locals>.<listcomp>r      )stripsplitsortedastliteral_evalint)
bedlines	row_infosbedlinepartsZ	array_strZ	array_valstateschromstartendr	   r	   r   epilogos_bedline_to_vector   s   r!   c                    sh   | d }|  d}|d }t|d }t|d }||d    fddtt|D }||||fS )a  
    Convert a line from a bedfile containing states in categorical data to vector format.

    Parameters
    ----------

    bedline: [string,...]
        A line form a bedfile broken up into its contituent parts
        (e.g. ["chr1", "1000", "2000", "state"]))


    states_dic: {'key':val,...}
        A dictionary containing the states in the file with a corresponding value
        (e.g. {'state1_name': 1, 'state2_name': 2,...})

    Returns
    -------

    Four variables containing the values associated with that line: chrom, start, end, states_vector
    (e.g. chrom = "chr1", start = 1000, end = 2000, states_vector = [1,0,0,0])
    r   r   r   r   r   c                    s   g | ]
}| kr
d ndqS )r   r   r	   )r   indexstater	   r   r   S   s    z,states_bedline_to_vector.<locals>.<listcomp>)r   r   r   rangelen)r   
states_dicr   r   r   r   r    Zstates_vectorr	   r#   r   states_bedline_to_vector3   s   r(   c                   C   s   dS )zW
    Aggregate a data file so that it stores the data at multiple
    resolutions.
    Nr	   r	   r	   r	   r   convertX   s   r)   c                     sl  t d  t !}t d| t|d}t|d}t|
|\}}}|d urFt	|d}dd |D }W d    n1 s@w   Y  nd }|j
D ]}|j|t|j| | t|  ftjdd	 qKd fd
d	}|dkrt| |t||| n8|dkr|d usJ ddd |D fddtt|D }t| |t|||| nt| ||||| |  |}t|d}|d u rt| d d d }t d| t|rt| |dkrdd }ndd }|dkrdd D }tj|t|||||||d ntj|t|||||||d W d    d S W d    d S 1 s/w   Y  d S )Nz
chrom_col:temporary dir:temp.mv5wrc                 S      g | ]	}|  d qS utf8r   encoder   liner	   r	   r   r          z)_bedgraph_to_multivec.<locals>.<listcomp>gzip	fillvaluecompressionc                    s  t  }t  }t  }g }| D ]c}|  }| d  }t|d  }	t|d  }
dd |d d   D }|| ||	 ||
 t|dkrVtd| t|dkratd| t|dkrltd| ||7 }qt|d t|d t|d |fS )Nr   c                 S   s"   g | ]}|d kst |ntjqS )ZNA)floatnpnan)r   fr	   r	   r   r      s    zT_bedgraph_to_multivec.<locals>.bedline_to_chrom_start_end_vector.<locals>.<listcomp>z'Chromosomes don't match in these lines:z+Start positions don't match in these lines:z)End positions don't match in these lines:r   )setr   r   r   addr&   
ValueErrorlist)r   r   Z	chrom_setZ	start_setZend_setZ
all_vectorr   r   r   r   r    Zvector)	chrom_colfrom_pos_colnum_rows
to_pos_col	value_colr	   r   !bedline_to_chrom_start_end_vector   sB   






z@_bedgraph_to_multivec.<locals>.bedline_to_chrom_start_end_vectorepilogosr   z:A row_infos file must be provided for --format = 'states' c                 S   s    g | ]}| d dd qS )r0   r   r   )decoder   )r   Zlner	   r	   r   r      s     c                    s   i | ]} | |qS r	   r	   )r   r   )states_namesr	   r   
<dictcomp>   s    z)_bedgraph_to_multivec.<locals>.<dictcomp>r   z.multires.mv5zoutput_file:	logsumexpc                 S   s   | j | jd ddf}|j}|d}d}|d }t||k r%td||t|< ||}tj|ddj }|d}tj	|||k < ||j}|S )	Nr   r   r   )r   g    חd   z7Error removing nan's when running logsumexp aggregationZaxis)
Treshapeshaper;   Znanminr@   isnansmrL   r<   )r   aZ
orig_shapenaZ	SMALL_NUMZNAN_THRESHOLD_NUMresZnresr	   r	   r   agg   s    


z"_bedgraph_to_multivec.<locals>.aggc                 S   "   | j | jd ddfjddj S Nr   r   r   rN   rO   rP   rQ   sumr
   r	   r	   r   rW        "c                 S   s   g | ]}| d qS r/   )r2   )r   Z
state_namer	   r	   r   r     s    

chromsizesrW   starting_resolution	tile_sizeoutput_filer   N)printtempfileTemporaryDirectoryopjoinh5pyFilecchload_chromsizesopenchrom_ordercreate_datasetmathceilchrom_lengthsr&   r;   r<   cmvbedfile_to_multivecr!   r%   r(   closesplitextexistsosremovecreate_multivec_multireszip) 	filepathsra   assemblyrB   rC   rE   rF   
has_header
chunk_size	nan_valuechromsizes_filenamer_   rD   formatrow_infos_filenamer`   methodtd	temp_filef_out
chrom_infochrom_namesZchrom_sizesr=   r   r   rG   r'   tff_inrW   Zstates_row_infosr	   )rB   rC   rD   rJ   rE   rF   r   _bedgraph_to_multiveca   s   





(	





"

  " $r   r{   Z	FILEPATHSr   )metavarnargsz--output-filez-ozuThe default output file name to use. If this isn't specified, clodius will replace the current extension with .hitile)defaulthelpz
--assemblyz-az6The genome assembly that this file was created against)r   typez--chromosome-colz>The column number (1-based) which contains the chromosome name)r   r   r   z--from-pos-colz@The column number (1-based) which contains the starting positionr   z--to-pos-colz>The column number (1-based) which contains the ending positionr   z--value-colz;The column number (1-based) which contains the actual value   z--has-header/--no-headerz2Does this file have a header that we should ignoreF)r   r   z--chunk-sizez)The size of the chunks to read in at onceg     j@z--nan-valuez The string to use as a NaN value)r   r   r   z--chromsizes-filenamez,A file containing chromosome sizes and orderz--starting-resolutionzbThe base resolution of the data. Used to determine how much space to allocate in the multivec filez
--num-rowsz:The number of rows at each position in the multivec formatz--format)r   rH   r   z'default':chr start end state1_value state2_value, etc; 'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]]; 'states': chr start end state_namer   )r   r   r   z--row-infos-filenamez<A file containing the names of the rows in the multivec filez--tile-sizez-t   z\The number of data points in each tile.Used to determine the number of zoom levelsto create.z--methodz:The method used to aggregate values (e.g. sum, average...)r[   rL   c                 C   s,   t | |||||||||	|
|||||| d S rb   )r   )r{   ra   r|   Zchromosome_colrC   rE   rF   r}   r~   r   r   r_   rD   r   r   r`   r   r	   r	   r   rs   ,  s&   qrs   c                 C   s  t  }td| t|d}t|d}t||\}	}
}|d ur@t	|d}dd |D }W d    n1 s:w   Y  nd }d}|}|	j
D ]}|j|t|	j| | t| ftjdd	 qIttt| d
dD ]S\}}t|rt|}t| t|
}|D ]/}td|| |	j| }t|| t| f}tj||d||d dd}||| d d |f< qqlt| d ql|  |  |}t|d}dd }tj|t |
||||||d W d    d S 1 sw   Y  d S )Nr*   r+   r,   r-   c                 S   r.   r/   r1   r3   r	   r	   r   r     r5   z'bigwigs_to_multivec.<locals>.<listcomp>r   r6   r7   Zbigwigs)Zdescz	chr_name:r   r[   )summaryz not is_bigwigc                 S   rX   rY   rZ   r
   r	   r	   r   rW     r\   z bigwigs_to_multivec.<locals>.aggr]   )!rd   re   rc   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   r&   r;   r<   r   rA   	enumeratebbiZ	is_bigwigr^   r>   keysintersectionfetchflushrt   rr   ry   rz   )r{   ra   r|   r   r   r`   r   r   r   r   r   rq   r=   r   r_   
resolutionr   Zbw_indexZbw_filer^   Zmatching_chromosomesZchr_nameZchr_lenZ	chr_shapeZarrr   r   rW   r	   r	   r   bigwigs_to_multivec  sn   
*





	"r   rb   ))r   ro   rw   Zos.pathpathrf   rd   rh   numpyr;   r   r   ZclickZclodius.chromosomesZchromosomesrj   Zclodius.multivecZmultivecrr   Znegspy.coordinatesZcoordinatesncZ
scipy.miscmiscrS    r   r!   r(   groupr)   r   commandargumentoptionZChoiceZavailable_chromsizesr   strrs   r   r	   r	   r	   r   <module>   s   
%
 L,(