
    tf7<                     `   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZ	d dl
m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ ddlmZ dBdZd Z ej:                         d        Zd ZejA                          ejB                  d	d
d       ejD                  dddd       ejD                  ddd ejF                   ejH                                      ejD                  ddde%       ejD                  ddde%       ejD                  ddde%       ejD                  ddd e%       ejD                  d!d"d#$       ejD                  d%d&d'$       ejD                  d(d)e&d*       ejD                  d+d,d$       ejD                  d-d.d$       ejD                  d/d0d$       ejD                  d1 ejF                  g d2      d3d45       ejD                  d6d7d$       ejD                  d8d9d:d;       ejD                  d<d= ejF                  d>d?g      d>*      d@                                                                                                                               Z'ejA                          ejB                  d	d
d       ejD                  dddd       ejD                  ddd ejF                   ejH                                      ejD                  d+d,d$       ejD                  d6d7d$       ejD                  d8d9d:d;      dA                                                  Z(y)C    N)tqdm   )clic                 6   | d   }|j                         j                  d      }|d   j                  d      d   }t        t        j                  |      d       }|D cg c]  }|d   	 }}|d   }t        |d         }	t        |d	         }
||	|
|fS c c}w )
aM  
    Convert a line from an epilogos bedfile to vector format.

    Parameters
    -----------
    bedline: [string,....]
        A line from a bedfile broken up into its constituent parts
        (e.g. ["chr1", "1000", "2000", "[1,2,34,5]"])

    Returns
    -------
    An array containing the values associated with that line
    r   	   :c                     | d   S )Nr    xs    ]/var/www/html/software/conda/envs/higlass/lib/python3.12/site-packages/clodius/cli/convert.py<lambda>z,epilogos_bedline_to_vector.<locals>.<lambda>)   s
    !A$     )keyr      )stripsplitsortedastliteral_evalint)bedlines	row_infosbedlineparts	array_str	array_valvstateschromstartends              r   epilogos_bedline_to_vectorr%      s     qkGMMO!!$'Eas#B'I s''	2GI%&qad&F&!HEaME
eAh-C5#v&& 's   !Bc                    | d   }|j                         j                  d      }|d   }t        |d         }t        |d         }||d      }t        t	        |            D cg c]  }||k(  rdnd }	}||||	fS c c}w )a  
    Convert a line from a bedfile containing states in categorical data to vector format.

    Parameters
    ----------

    bedline: [string,...]
        A line form a bedfile broken up into its contituent parts
        (e.g. ["chr1", "1000", "2000", "state"]))


    states_dic: {'key':val,...}
        A dictionary containing the states in the file with a corresponding value
        (e.g. {'state1_name': 1, 'state2_name': 2,...})

    Returns
    -------

    Four variables containing the values associated with that line: chrom, start, end, states_vector
    (e.g. chrom = "chr1", start = 1000, end = 2000, states_vector = [1,0,0,0])
    r   r   r   r   r   )r   r   r   rangelen)
r   
states_dicr   r   r"   r#   r$   stateindexstates_vectors
             r   states_bedline_to_vectorr-   3   s    2 qkGMMO!!$'E!HEaME
eAh-CuQx E=B3z?=STE%5.Qa/TMT5#}-- Us   $A<c                       y)zW
    Aggregate a data file so that it stores the data at multiple
    resolutions.
    Nr   r   r   r   convertr/   X   s     	r   c                 D   t        d       t        j                         5 }t        d|       t        j                  |d      }t        j                  |d      }t        j                  |
|      \  }}}|Bt        |d      5 }|D cg c]!  }|j                         j                  d      # }}d d d        nd }|j                  D ]V  }|j                  |t        j                  |j                   |   |z        t#        |       z  ft$        j&                  d       X dfd		}|d
k(  r t)        j*                  | |t,        |||       n|dk(  rJ d       |D cg c]%  }|j/                  d      j1                  d      d   ' }}t3        t#        |            D ci c]  }||   |
 }}t)        j*                  | |t4        ||||       nt)        j*                  | |||||       |j7                          |} t        j                  | d      }!|t        j8                  | d         d   dz   }t        d|       t        j:                  |      rt=        j>                  |       |dk(  rd }"nd }"|dk(  rED #cg c]  }#|#j                  d       }$}#t)        j@                  |!tC        ||      |"||||$       n&t)        j@                  |!tC        ||      |"|||       d d d        y c c}w # 1 sw Y   ExY wc c}w c c}w c c}#w # 1 sw Y   y xY w)Nz
chrom_col:temporary dir:temp.mv5wrutf8gzip	fillvaluecompressionc                    t               }t               }t               }g }| D ]
  }|j                         j                         }|dz
     }t        |dz
           }	t        |dz
           }
|dz
  dz
  z    D cg c]"  }|dk(  st	        |      nt
        j                  $ }}|j                  |       |j                  |	       |j                  |
       t        |      dkD  rt        d|       t        |      dkD  rt        d|       t        |      dkD  rt        d|       ||z  } t        |      d   t        |      d   t        |      d   |fS c c}w )Nr   NAz'Chromosomes don't match in these lines:z+Start positions don't match in these lines:z)End positions don't match in these lines:r   )setr   r   r   floatnpnanaddr(   
ValueErrorlist)r   r   	chrom_set	start_setend_set
all_vectorr   r   r"   r#   r$   fvector	chrom_colfrom_pos_colnum_rows
to_pos_col	value_cols                r   !bedline_to_chrom_start_end_vectorz@_bedgraph_to_multivec.<locals>.bedline_to_chrom_start_end_vector   s   IIeGJ# %--/i!m,E,"234%
Q/0 #9q=9q=83KL %&IE!H2669  e$e$C y>A%$A8  y>A%$Ex  w<!#$CX  f$
3%8 Y"Y"Wa 	 -s    'Eepilogosr!   z:A row_infos file must be provided for --format = 'states' r   r   z.multires.mv5zoutput_file:	logsumexpc                    | j                   j                  | j                  d   ddf      }|j                  }|j                  d      }d}|dz  }t        j                  |      |k  rt        d      ||t        j                  |      <   |j                  |      }t        j                  |d      j                   }|j                  d      }t        j                  |||k  <   |j                  |j                        }|S )	Nr   r
   r   )r
   g    חd   z7Error removing nan's when running logsumexp aggregationaxis)
Treshapeshaper>   nanminrA   isnansmrP   r?   )r   a
orig_shapena	SMALL_NUMNAN_THRESHOLD_NUMresnress           r   aggz"_bedgraph_to_multivec.<locals>.agg   s     CCKKR 34 WW
YYu% 	$-O!99R=#44$Q  $-288B< ZZ
+ll11-//{{5)13T--.ll399- 
r   c                     | j                   j                  | j                  d   ddf      j                  d      j                   S Nr   r
   r   rS   rU   rV   rW   sumr   s    r   rb   z"_bedgraph_to_multivec.<locals>.agg  s8    ss{{AGGAJA#67;;;CEEEr   
chromsizesrb   starting_resolution	tile_sizeoutput_filer   N)"printtempfileTemporaryDirectoryopjoinh5pyFilecchload_chromsizesopenr   encodechrom_ordercreate_datasetmathceilchrom_lengthsr(   r>   r?   cmvbedfile_to_multivecr%   decoder   r'   r-   closesplitextexistsosremovecreate_multivec_multireszip)%	filepathsrk   assemblyrI   rJ   rL   rM   
has_header
chunk_size	nan_valuechromsizes_filenameri   rK   formatrow_infos_filenamerj   methodtd	temp_filef_out
chrom_infochrom_nameschrom_sizesrG   liner   r"   rN   lnestates_namesr   r)   tff_inrb   
state_namestates_row_infoss%      ````     `                        r   _bedgraph_to_multivecr   a   sV   & 
,	"		$	$	& s"#GGB
+			)S)141D1D2
.[+ )(#. H!EFGTTZZ\008G	GH H I++ 		E  IIj66u=@SSTs9~- &&" ! 		&	 &	P Z##*# x%LKL%IRS#CJJv.44T:1=SLS6;C	N6KL,q/1,LJL##(#	 ##1# 	yyS!++il3A6HKnk* 99[!IIk"[ DF X<H .8
!!&)    (({K8$7#'* (({K8$7#'#Ws s HH HN TLV }s ss]   A!LK:&K52K:4B1L%*LL'L4CLLAL5K::L	?LLr   	FILEPATHSr
   )metavarnargsz--output-filez-ozuThe default output file name to use. If this isn't specified, clodius will replace the current extension with .hitile)defaulthelpz
--assemblyz-az6The genome assembly that this file was created against)r   typez--chromosome-colz>The column number (1-based) which contains the chromosome name)r   r   r   z--from-pos-colz@The column number (1-based) which contains the starting positionr   z--to-pos-colz>The column number (1-based) which contains the ending positionr   z--value-colz;The column number (1-based) which contains the actual value   z--has-header/--no-headerz2Does this file have a header that we should ignoreF)r   r   z--chunk-sizez)The size of the chunks to read in at onceg     j@z--nan-valuez The string to use as a NaN value)r   r   r   z--chromsizes-filenamez,A file containing chromosome sizes and orderz--starting-resolutionzbThe base resolution of the data. Used to determine how much space to allocate in the multivec filez
--num-rowsz:The number of rows at each position in the multivec formatz--format)r   rO   r!   z'default':chr start end state1_value state2_value, etc; 'epilogos': chr start end [[state1_value, state1_num],[state2_value, state2_num],[etc]]; 'states': chr start end state_namer   )r   r   r   z--row-infos-filenamez<A file containing the names of the rows in the multivec filez--tile-sizez-t   z\The number of data points in each tile.Used to determine the number of zoom levelsto create.z--methodz:The method used to aggregate values (e.g. sum, average...)rf   rP   c                 :    t        | |||||||||	|
||||||       y rl   )r   )r   rk   r   chromosome_colrJ   rL   rM   r   r   r   r   ri   rK   r   r   rj   r   s                    r   r~   r~   ,  s@    b #r   c                 :   t        j                         5 }t        d|       t        j                  |d      }t        j                  |d      }t        j                  ||      \  }	}
}|Bt        |d      5 }|D cg c]!  }|j                         j                  d      # }}d d d        nd }d}|}|	j                  D ]S  }|j                  |t        j                  |	j                   |   |z        t#        |       ft$        j&                  d       U t)        t+        t-        |             d	
      D ]  \  }}t/        j0                  |      rt/        j2                  |      }t5        |j7                               j9                  t5        |
            }|D ]k  }t        d||       |	j                   |   }t        j                  ||z        t#        |       f}t/        j:                  ||d||d   d      }|||   d d |f<   m t        | d        |j=                          |j?                          |}t        j                  |d      }d }tA        jB                  |tE        |
|      ||||       d d d        y c c}w # 1 sw Y   xY w# 1 sw Y   y xY w)Nr1   r2   r3   r4   r5   r   r6   r7   bigwigs)descz	chr_name:r   rf   )summaryz not is_bigwigc                     | j                   j                  | j                  d   ddf      j                  d      j                   S rd   re   r   s    r   rb   z bigwigs_to_multivec.<locals>.agg  s8    33;;
B2377Q7?AAAr   rg   )#rn   ro   rm   rp   rq   rr   rs   rt   ru   rv   r   rw   rx   ry   rz   r{   r|   r(   r>   r?   r   rB   	enumeratebbi	is_bigwigrh   r<   keysintersectionfetchflushr   r}   r   r   )r   rk   r   r   r   rj   r   r   r   r   r   r|   rG   r   r   ri   
resolutionr"   bw_indexbw_filerh   matching_chromosomeschr_namechr_len	chr_shapearrr   r   rb   s                                r   bigwigs_to_multivecr     s   T 
	$	$	& C
"#GGB
+			)S)363F3F4
0[- )(#. H!EFGTTZZ\008G	GH H I(
++ 		E  IIj66u=@SST	N &&" ! 		 "&d9Y+?&@y!Q 	2Hg}}W% ^^G4
'*:??+<'='J'J$($
 !5 7H+x<(66x@G!%7Z+?!@#i. QI))1gy|UC 47E(OAxK07 	01#	2& 	yyS!	B 	$$;6 3#	
wC
 C
 HH HC
 C
s7   A!J6J;&I?!J#GJ?JJ		JJrl   ))r   rz   r   os.pathpathrp   rn   rr   numpyr>   r   r   clickclodius.chromosomeschromosomesrt   clodius.multivecmultivecr}   negspy.coordinatescoordinatesnc
scipy.miscmiscrZ    r   r%   r-   groupr/   r   commandargumentoptionChoiceavailable_chromsizesr   strr~   r   r   r   r   <module>r      s   
  	      
  !    '<".J 	 	HV 	[;
	 	A	-b--/	0	 	I		 	K		 	I		 	F		 	=
 Dc :d 	7
 
	 	E
 	7	8
)  	G
 
	 	E	uk*	+	% < |%P 	[;
	 	A	-b--/	0	 	7
 	G
 
	K
 < DK
r   