
    DUfy                        d Z ddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lm Z m!Z! ddl"m#Z#  e$e           d         Z% e$e           d         Z& e$e           d         Z' e$e           d         Z(dZ) ej*        ej+                   	 	 	 	 	 	 	 	 	 	 d"dZ,d Z-d Z.i ddde/fdZ0	 	 	 	 	 	 	 	 	 	 d"dZ1	 	 	 	 d#dZ2	 	 	 	 	 d$d!Z3dS )%a  
This module enables construction of observed over expected pixels tables and
storing them inside a cooler.

It includes 3 main functions.
expected_full - is a convenience function that calculates cis and trans-expected
    and "stitches" them togeter. Such a stitched expected that "covers"
    entire Hi-C heatmap can be easily merged with the pixel table.
expected_full_fast - generated the same output as `expected_full` but ~2x faster.
    Efficiency is achieved through calculating cis and trans expected in one
    pass of the pixel table. Post-processing is not fully implemented yet.
obs_over_exp - is a function that merges pre-calculated full expected with the pixel
    table in pd.DataFrame or dask.DataFrame formats.
obs_over_exp_generator - is a function/generator(lazy iterator) that wraps
    `obs_over_exp` function and yields chunks of observed/expected pixel table.
    Such a "stream" can be used in cooler.create as a "pixels" argument to write
    obs/exp cooler-file.

It also includes 3 helper functions (used in `expected_full_fast`):
make_pairwise_expected_table - a function that creates an empty table for the
    full expected with all the right sizes and number of valid pixels pre-filled.
    Combines functionality of `make_diag_tables` and `make_block_tables` from the API.
sum_pairwise - a function that calculates the full pixel summary for all pairwise
    combinations of the regions in the view, and for each genomic separation for
    cis-combinations of regions. In a nutshell - it calls `make_pairwise_expected_table`
    to generate empty table for expected, and it gets filled by applying `_sum_` to the
    pixels table.
_sum_ - a function that does the actual summing of pixel values grouped by regions
    and genomic separations - can work on a chunk of pixel table.
    N)	partition)teechaincombinations_with_replacement)reducepartial)expected_cisexpected_trans)assign_supportsmake_cooler_view)is_compatible_viewframeis_cooler_balanced)make_block_tablemake_diag_tables)diag_expected_dtypesblock_expected_dtypes)expected_smoothing         )levelF皙?expectedweight逖    c                 J   t          j                    }t          | |d||||||	|

  
        }t          j                    |z
  }t          j        d|dd           t          j                    }t          | |||	|
          }t          |d<   t          j                    |z
  }t          j        d|dd           |                                                    d	d
i          	                    d          }|rdnd}|r| d}|r| d}||         
                                ||<   |rdnd}|r| d}ddg}|r|                    d           |                    |d         j        |d                                                  |d         j        |d                                                  g          |                             d          }|d         |d         z  |d<   |r|d         |d         z  |d<   ||         
                                ||<   t!          j        ||gd          }|d
         j        |d                                                  |d<   |d
         j        |d                                                  |d<   t          j        d           |S )Z  
    Generate a DataFrame with expected for *all* 2D regions
    tiling entire heatmap in clr.
    Such 2D regions are defined as all pairwise combinations
    of the regions in view_df. Average distance decay is calculated
    for every cis-region (e.g. inter- and intra-arms), and
    a "simple" average over each block is caculated for trans-
    regions.

    When sub-chromosomal view is provided, trans averages
    can be aggregated back to the level of full chromosomes.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    view_df : viewframe
        expected is calculated for all pairwise combinations of regions
        in view_df. Distance dependent expected is calculated for cis
        regions, and block-level average is calculated for trans regions.
    smooth_cis: bool
        Apply smoothing to cis-expected. Will be stored in an additional column
    aggregate_smoothed: bool
        When smoothing cis expected, average over all regions, ignored without smoothing.
    smooth_sigma: float
        Control smoothing with the standard deviation of the smoothing Gaussian kernel.
        Ignored without smoothing.
    aggregate_trans : bool
        Aggregate trans-expected at the inter-chromosomal level.
    expected_column_name : str
        Name of the column where to store combined expected
    ignore_diags : int, optional
        Number of intial diagonals to exclude for calculation of distance dependent
        expected.
    clr_weight_name : str or None
        Name of balancing weight column from the cooler to use.
        Use raw unbalanced data, when None.
    chunksize : int, optional
        Size of pixel table chunks to process
    nproc : int, optional
        How many processes to use for calculation
    Returns
    -------
    expected_df: pd.DataFrame
        cis and trans expected combined together
    F)	view_df
intra_onlysmoothsmooth_sigmaaggregate_smoothedclr_weight_nameignore_diags	chunksizenprocz!Done calculating cis expected in .3f sec ...)r    r%   r'   r(   distz#Done calculating trans expected in indexrcolumnsnamebalanced.avg	count.avgz	.smoothed.aggn_valid	count.sumbalanced.sumchromregion1region2sumzcount.avg.aggzbalanced.avg.aggT)ignore_indexr1r2z&Returning combined expected DataFrame.)timeperf_counterr	   logginginfor
   TRANS_DIST_VALUEreset_indexrename	set_indexcopyappendgroupbylocto_numpy	transformpdconcat)clrr    
smooth_cisr$   r#   aggregate_transexpected_column_namer&   r%   r'   r(   
time_startcvdtime_elapsedcpb
view_labelcis_expected_nametrans_expected_nameadditive_cols_cpb_aggexpected_dfs                        b/var/www/html/software/conda/lib/python3.11/site-packages/cooltools/sandbox/obs_over_exp_cooler.pyexpected_fullr]   S   s   z "$$J
!-'!  C $&&3LLO\OOOOPPP "$$J
'  C #CK$&&3LLQ|QQQQRRR ..6""  +:J{ ;0;;; 	;#4 : : : #$5 6 ; ; = =C -<L.. S!4:::";/ 	1  000;;7#'I7@@BB7#'I7@@BB
 

 
 #5)) 	  (4Xi5HHO 	S&.~&>x	?R&RC"# #$7 8 = = ? ?C )S#JT:::K #3+K	,BCLLNNK"3+K	,BCLLNNK L:;;;    c           	      p   t          t          |                                d                    \  }}d |D             }d |D             }t          | \  }}t	          j        |                              dg          }t	          j        |                              dg          }t          | |||          }t          | \  }}t	          j        |                              dg          }t	          j        |                              dg          }t          | |||          }g }	|ddg                                         D ]W\  }
}}|ddg                                         D ]2\  }}}||
k    r$||k    rz|||f         	                                }|
                    d	d
|           |
                    d	d|
           |	                    |                    dd
t          g                     ||k    rt	          j        |||f         d	g          }|
                    d	t          t                     |
                    d	d
|           |
                    d	d|
           |	                    |                    dd
t          g                     4Yt	          j        |	          S )z
    create a DataFrame for accumulating expected summaries (blocks and diagonal ones)
    it also contains "n_valid" column for dividing summaries by.
    r   c              3   D   K   | ]\  }}|j         |j         k    ||fV  d S Nr7   .0r<   r=   s      r\   	<genexpr>z/make_pairwise_expected_table.<locals>.<genexpr>   s8      LLfb"bh"(6J6J"b6J6J6J6JLLr^   c              3   D   K   | ]\  }}|j         |j         k    ||fV  d S ra   rb   rc   s      r\   re   z/make_pairwise_expected_table.<locals>.<genexpr>   s8      PPB"(bh:N:NB8:N:N:N:NPPr^   Indexr.   r%   r7   r0   r   r=   r<   )r,   )r   r   
itertuplesziprL   	DataFramedropr   r   rC   insertrG   rE   
_DIST_NAMErB   rM   )rN   r    r%   	cis_combstrans_combsregions1regions2dtablesbtables_tables_r1chrom1name1_r2chrom2name2dfs                    r\   make_pairwise_expected_tabler}      s    !%g&8&8&:&:A>> I{ ML	LLLIPP+PPPK iHh|H%%**G9*==H|H%%**G9*==HsHhXXXG k*Hh|H%%**G9*==H|H%%**G9*==HsHhXXXG G%wv&67BBDD K KVU")76*:";"F"F"H"H 	K 	KCs

FNN %0<<>>BIIas+++IIas+++NN2<<tZ0H#I#IJJJf$$guen&=aSIIIBIIa-=>>>IIas+++IIas+++NN2<<tZ0H#I#IJJJ	K  9gr^   c                    |\  }}|                                  dd         }t          ||          |d<   |                                 ||         }	t          j        |	|d          }	||	                    ddg          }	n |	                    dd|dz   |d	z   g          }	|	                    t          t          d
          }	|	d         |	d         k    }
t          |	j	        ddt          f<   |	j	        |
df         |	j	        |
df         z
  |	j	        |
t          f<   |                                D ]\  }} ||	          |	|<   |	                    ddt          g          }||                                                             d          S )a>  
    calculates summaries for every pixel block defined by pairwise
    combinations of the specified regions: calculates per-diagonal
    sums for intra-chromosomal blocks and overall sums for inter-
    chromosomal blocks.

    pixels in the blocks are labeled with regions' serial numbers,
    and groupby-s are used grouping.

    Trans-values have a special value for their diagonal/dist -1

    Return:
    dictionary of DataFrames with diagonal sums and overall sums
    for the "fields", indexed with the (i,j) combinations of serial
    numbers of the regions.
    Nr-   Freplacer<   r=   subset12)r<   r=   rw   rz   bin2_idbin1_id.sum)binsr   pixelscoolerannotatedropnaastypeintrB   rI   rn   itemsrH   r:   
add_suffix)rN   fields
transformsr%   regionsspanlohir   r   cis_maskfieldt_blockss                 r\   _sum_r     s   " FB88::aaa=Dg..DIZZ\\"R% F_VT5999F tTl33$# 57LM  
 

 ]]3//00F h6(#33H !1FJqqq*}'-z(I2E'FT\^gTgIh'hFJx#$$$&& " "q&		u nndD*566G 6?  ++F333r^   c           	      B   t          dt          |                                           |          }t          t	          dg|                    }d |D             }		 t          || dd          }
n"# t          $ r}t          d          |d}~ww xY wt          | ||          }|	D ]}d||<   t          t          | ||||                                          } |||          }t          d	 ||          }|rJt          |          D ]:}|                    |t          d
          j        }t"          j        |j        ||	f<   ;|                    t          d           |                    dt,          |j        |j                            d          df                                                    |                    dt0          |j        |j                            d          df                                                    |                    ddgdd           |S )aU  

    Intra-chromosomal diagonal summary statistics for asymmetric blocks of
    contact matrix defined as pairwise combinations of regions in "view_df.

    Note
    ----
    This is a special case of asymmetric diagonal summary statistic that is
    efficient and covers the most important practical case of inter-chromosomal
    arms "expected" calculation.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    view_df : viewframe
        view_df of regions for intra-chromosomal diagonal summation, has to
        be sorted according to the order of chromosomes in cooler.
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    clr_weight_name : str
        name of the balancing weight vector used to count "bad"
        pixels per diagonal. Set to `None` not to mask
        "bad" pixels (raw data only).
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all intra-chromosomal blocks defined as
    pairwise combinations of regions in the view

    r   countc                     g | ]}| d S )r    )rd   r   s     r\   
<listcomp>z sum_pairwise.<locals>.<listcomp>z  s    999nnn999r^   Tcheck_sortingraise_errorszprovided view_df is not validNrh   c                 0    |                      |d          S )Nr   )
fill_value)add)df1df2s     r\   <lambda>zsum_pairwise.<locals>.<lambda>  s    swwsqw'A'A r^   F)r   
drop_level)r   inplacer<   r0   r   r=   )r   rl   r   )r   lenr   listr   r   	Exception
ValueErrorr}   r   r   rJ   r   rangexsrn   r,   npnanrI   rC   rm   _REGION1_NAMEget_level_values_REGION2_NAME)rN   r    r   r%   r&   r'   mapspansr   summary_fields_e	exp_tableagg_namejobresults	result_df_d_idxs                      r\   sum_pairwiser   I  sK   \ aSZZ\\**I66E%	:..//F99&999NA#	
 
 
  A A A899q@A
 -S'?[[[I #    	( sFJ9I9I9K9K C c#uooG AA7IVVI  9%% 	9 	9B<<*<GGMD24&IM$.// 
D999Qw{9?3S3STX3Y3Y[a3a'b'k'k'm'mnnnQw{9?3S3STX3Y3Y[a3a'b'k'k'm'mnnnt4FFFs   A0 0
B:B

Bc           	         |t          |           }n7	 t          || dd          }n"# t          $ r}t          d          |d}~ww xY wt          t
          dd}|i }d|d<   d	|d
<   n@t          | |          r|dz   |dz   dfdi}d|d<   d|d
<   nt          d| d          |
dk    rt          j        |
          }|j	        }nt          }t          j                    }	 t          | |||||	|          }|
dk    r|                                 n # |
dk    r|                                 w w xY w||d                                      ||d                            ||d
         <   |                                                    ddi                              d          }|j        |d         df                                         |d<   |j        |d         df                                         |d<   |                    t+          |j                  |t.          j                   d||fv rT|j        |d         df                                         |d <   |j        |d         df                                         |d!<   |r#|dk    rd d!g}n&|d"k    rd}nt          d#          t2          t4          g}|t                   t6          k    }|t                   t6          k    }|d         |d         g}|r~t9          j        |j        |         |||$          }|d
         |d%         z   }|                    |t          |g         g |pg t          d&'          }|j        ||f         |j        ||f<   n|r|j        |                             g |pg t          d(          |                              d)          !                    d*          }||d          d*                             ||d          d*                   ||<   ||         |j        ||f<   n|j        ||d
         f         |j        ||f<   |r|dk    rQ|j        |                             d d!gd(          |                              d)          !                    d*          }nX|d"k    rC|j        |         }|j        ||f                              d)          !                    d*          }nt          d+          ||d          d*                             ||d          d*                   ||<   ||         |j        ||f<   n|j        ||d
         f         |j        ||f<   t          j                    |z
  }tE          j#        d,|d-d.           |S )/r   NTr   z0view_df is not a valid viewframe or incompatiblez.smooth)r+   n_pixelssmooth_suffixr5   
n_contactsr2   contact_freqr   r   balancedc                 8    | d         |          z  |          z  S )Nr   r   )pweight1weight2s    r\   r   z$expected_full_fast.<locals>.<lambda>  s    AgJ7,Caj,P r^   r6   r1   z+cooler is not balanced, orbalancing weight z  is not available in the cooler.r   )r    r   r%   r&   r'   r   r   r,   r-   r.   r0   r8   r<   r9   r=   )rI   columnvaluer7   rw   rz   genomez2aggregate_cis could be only chrom, genome or False)rH   sigma_log10colsr   left)onhow)observedr:   r3   z4aggregate_trans could be only chrom, genome or FalsezDone calculating full expected r)   r*   )$r   r   r   r   rn   _NUM_VALID_NAMEr   mpPoolr   r>   r?   r   closedividerC   rD   rE   rI   rJ   rm   r   r/   r   r   r   r   rB   r   agg_smooth_cvdmergerH   rK   r   r@   rA   ) rN   r    rO   aggregate_cisr#   rP   rQ   r&   r%   r'   r(   r   r   r   r   poolmap_rR   resultrV   grp_columns	_cis_mask_trans_maskrY   
_smooth_df_smooth_col_name_agg_df_trans_agg_df	_trans_dfrT   r   r   s                                  @@r\   expected_full_fastr     s   x &s++GG		X'"!  AA  	X 	X 	XOPPVWW	X
 #" D
 
(\*^	C	1	1 
!C'!C' "P"P"P"P"PQ
+\-^R /R R R
 
 	
 qyywu~~x "$$J!+%
 
 
 199JJLLL 199JJLLLL  $*$|*<#=#D#DtJ $ $F4 
 ..6""  >&"3S"89BBDDF4L>&"3S"89BBDDF4L MMc&.))2FbfMUUU ?M222%>&*;W*DENNPPx%>&*;W*DENNPPx 5G###X.KKh&&KKQRRR$m4 z"&66I$(88K :&\(:<M  b'6Jy!$	
 
 

  /$2GG%5781+#1j1  
 
 7=jL\A\6]
92233	 b*Y'W7)r7J7$WGGWYuZ 	
 )0T,5G0O0O0O(P(W(Wj)1112)
 )
$% 7>>R6S
92233 7=jDQ_L`A`6a
9223  fg%%"J{3(H-==mM5!!F## M ((
;/I"J{M'AB5!!F## M STTT.;lAS<[<[<[.\.c.ctJ/7778/
 /
*+ 9FFZ8[
; 4455 9?
;PTUcPdCd8e
; 445 $&&3LLM<MMMMNNNMs    * 
A	AA	(D D6r-   oec                 Z   |rd}| d}| d}	nd}| d}
| d}t          j        | |d          }|r8|d         ||         z  ||	         z  ||<   |                    |
|
||	g          }n|                    |
|
g          }|                    |
t          |
t          i          }|d         |d	         z
  |d
<   |d         |d         k    }|d
                             |t                    |d
<   |                    ||
|d
|g         d|
|d
g          }||         ||         z  ||<   |S )a  
    A function that returns pixel table with observed over expected.
    It takes a DataFrame of pixels (complete or a chunk, in pandas or dask formats),
    and merges it with the `expected_full` DataFrame, in order to assign appropriate
    expected for each pixels. This assignment is done according to the pixels' location,
    specifically - which combination of regions in the view and genomic distance for
    cis-pixels.

    Parameters
    ----------
    pixels: pd.DataFrame | dask.DataFrame
        DataFrame of pixels
    bins : pd.DataFrame
        A bin table with a view column annotation.
    expected_full : pd.DataFrame
        DataFrame expected for all regions in the view used for annotation of bins.
    view_column_name : str
        Name of the column with the view annotations in `bins` and `expected_full`
    expected_column_name : str
        Name of the column with the expected values in `expected_full`
    clr_weight_name : str or None
        Name of balancing weight column from the cooler to use.
        Use raw unbalanced data, when None.
    oe_column_name : str
        Name of the column to store observed over expected in.

    Returns
    -------
    pixels_oe : pd.DataFrame | dask.DataFrame
        DataFrame of pixels with observed/expected
    r   r   r   r   Fr   r   r   r   r+   rw   rz   r   )r   r   )r   r   r   r   r   whererB   r   )r   r   r]   view_column_namerQ   r%   oe_column_nameobserved_column_nameweight_col1weight_col2	view_col1	view_col2	pixels_oer   s                 r\   obs_over_expr     s   T  ')(+++(+++& $&&&I#&&&I e<<<I  F*3G*<y?U*UXabmXn*n	&'$$iKQ\-]$__		$$i-C$EE	   )c9s!CDDI "),y/CCIf"i&99H!&)//:JKKIf y)V5IJKy&)    I !**> ?)L`Ba aInr^   r   @B c           
   #     K   |"t          |                                           }n|                                }d}|                                 dd         }	t          |	|          |	|<   t	          dt          |                                           |          }
|
D ]D}|\  }}t          |                                 ||         |	|||||          }|dd|g         V  EdS )an  
    Generator yielding chunks of pixels with
    pre-caluclated observed over expected.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    expected_full : pd.DataFrame
        DataFrame expected for all pairwise combinations of regions in view_df
    view_df : viewframe
        viewframe of regions that were used to calculate expected_full
    expected_column_name : str
        Name of the column with the combined expected
    oe_column_name : str
        Name of the column to store observed over expected
    clr_weight_name : str or None
        Name of balancing weight column from the cooler to use.
        Use raw unbalanced data, when None.
    chunksize : int, optional
        Size of pixel table chunks to process and output

    Yields
    ------
    pixel_df: pd.DataFrame
        chunks of pixels with observed over expected
    Nr-   r   )r   rQ   r%   r   r   r   )r   rJ   r   r   r   r   r   r   )rN   r]   r    rQ   r%   r   r'   
view_arrayr   	bins_viewr   r   r   r   oe_chunks                  r\   obs_over_exp_generatorr     s     J %c**3355

%%''
 

111I"1)Z"H"HI aSZZ\\**I66E ? ?BJJLLB-!5+)
 
 
 	9n=>>>>>? ?r^   )
NFFr   Fr   r   r   r   r   )r-   r   r   r   )Nr   r   r   r   )4__doc__r>   r@   numpyr   pandasrL   multiprocessr   r   cooler.utilr   	itertoolsr   r   r   	functoolsr   r   	cooltoolsr	   r
   cooltools.lib.commonr   r   cooltools.libr   r   cooltools.api.expectedr   r   cooltools.lib.schemasr   r   cooltools.sandboxr   r   r   r   rn   r   rB   basicConfigINFOr]   r}   r   r   r   r   r   r   r   r^   r\   <module>r     s   <                ! ! ! ! ! !          & % % % % % % %              
       
 F E E E E E E E       
 1 0 0 0 0 0 )**1-)**1-T&''*
$+,,Q/   ', ' ' ' '
  ' I I I IX1  1  1 h34 34 34r _ _ _ _H ' d d d dV #T T T Tt #?? ?? ?? ?? ?? ??r^   