
    DUfb                     L   d dl Z d dlZ ej        ej                   d dlZd dlmZ d dlZd dl	Z
d dlZd dlmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ ddZ	 	 	 	 ddZeddd ddddddef
d            Zd Z	 	 	 	 	 d dZd!dZ 	 	 	 	 	 d"dZ!	 	 	 	 	 	 	 	 	 	 d#dZ"dS )$    N)level)partial)threshold_lithreshold_otsu   )CSRSelector)peaksnumutils)is_compatible_viewframeis_cooler_balanced)make_cooler_viewpool_decorator
   c           	         t          |           }t          j        |          }t          j        |t                    }t	          d|          D ]}t	          d|          D ]}||z   |k     rd|dd<   |dk    r|| z  }n||dxx         | d|          z  cc<   |dk    r|| z  }n|d| xx         | |d         z  cc<   |||r| ndxx         d|||r| nd         z
  z  cc<   |S )zJ
    Calculate the number of "good" pixels in a diamond at each bin.

    dtyper   FN   )lennpzerosboolrange)bad_bin_maskwindowignore_diagsNn_pixelsloc_bad_bin_maski_shiftj_shifts           U/var/www/html/software/conda/lib/python3.11/site-packages/cooltools/api/insulation.pyget_n_pixelsr"      sd   
 	LAx{{Hx...F##  Q'' 	 	G <//"'QQQ!|| L0   ***l9WH9.EE***!|| L0   7(+++|GHH/EE+++WG =>???$WG0M%NOO????	" O    Tweightc           
      |   |j                                         }|j                                         dz   }||z
  }t          j        |          }	t          j        |          }
|3t          t          j        dt          |                    ||          }n?t          ||                                         j	        ||          }|dz   |dz   fd}| 
                                D ]d}t          j        |g d	          }||j        |j        z
  |dz
  d
z  k             }|rJt          j        |||g                   } ||          |d<   |d                                         j	         }|j        j	        |z
  }|j        j	        |z
  }t#          d|          D ]}t#          d|          D ]}||z   |k     r||z   ||z
  k    ||z   |k     z  ||z
  dk    z  }|	t          j        ||         |z   |d         j	        |         |          z  }	|r:|
t          j        |||z           |z   |d         j	        ||z           |          z  }
ft'          j                    5  t'          j        d           |r|
|z  }n|	|z  }|r|t          j        |          z  }ddd           n# 1 swxY w Y   |||
|	fS )a1  
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    pixel_query : RangeQuery object <TODO:update description>
        A table of Hi-C interactions. Must follow the Cooler columnar format:
        bin1_id, bin2_id, count, balanced (optional)).
    bins : pandas.DataFrame
        A table of bins, is used to determine the span of the matrix
        and the locations of bad bins.
    window : int
        The width (in bins) of the diamond window to calculate the insulation
        score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, a few first diagonals
        of the Hi-C map should be ignored due to contamination with Hi-C
        artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    clr_weight_name : str or None
        Name of balancing weight column from the cooler to use.
        Using raw unbalanced data is not supported for insulation.
    r   NF)r   r   12c                 8    | d         |          z  |          z  S )Ncount )pweight1weight2s    r!   <lambda>zinsul_diamond.<locals>.<lambda>h   s    aj1W:5'
B r#   )bin1_idbin2_idr)   )columnsr   balancedr   r)   )	minlengthignore)indexminmaxr   r   r"   repeatr   isnullvaluesread_chunkedpd	DataFramer0   r/   coolerannotater   bincountwarningscatch_warningssimplefilter	nanmedian)pixel_querybinsr   r   norm_by_medianclr_weight_name	lo_bin_id	hi_bin_idr   
sum_countssum_balancedr   	transform
chunk_dictchunkdiag_pixelsvalid_pixel_maskijr   r    maskscorer,   r-   s                          @@r!   insul_diamondrV   2   s   B 
  I
  1$IIA!J8A;;LIeSYY''\
 
 

  !((**1%
 
 
 "C'!C'BBBBB	!..00    
Z1P1P1PQQQEMEM9fqjA=MMN 	H /+t_<M7NOOK&/i&<&<K
# +J 7 > > @ @ GG&2&2Q'' 	 	G F++  W$|33 [AK/7{Q(7{a')  bkdGg%{7';'B4'HTU   
 #  BK$!112W<#J/6t>N7NO"#% % % L	, 
	 	"	" 	) 	)h''' 	* 8+EE)E 	)R\%(((E	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) (L*44s   &;J--J14J1
is_bad_binF -1r   c                    |t          |           }n7	 t          || dd          }n"# t          $ r}t          d          |d}~ww xY w|r:	 t	          | |d          }n&# t          $ r}t          d| d          |d}~ww xY w| j        d         }|R	 |                     | j                            d	          d
| z             d         }n?#  t          d|           xY wt          |t                    rnt          d|           t          j        |          r|g}t          j        |t                    }||z  dk    }t          j        |          rt          d||          d|           t          t           | ||||||||	
  
        } |||g d         j                  }t%          j        |          }|S )a  Calculate the diamond insulation scores for all bins in a cooler.

    Parameters
    ----------
    clr : cooler.Cooler
        A cooler with balanced Hi-C data.
    window_bp : int or list of integers
        The size of the sliding diamond window used to calculate the insulation
        score. If a list is provided, then a insulation score if calculated for each
        value of window_bp.
    view_df : bioframe.viewframe or None
        Viewframe for independent calculation of insulation scores for regions
    ignore_diags : int | None
        The number of diagonals to ignore. If None, equals the number of
        diagonals ignored during IC balancing.
    min_dist_bad_bin : int
        The minimal allowed distance to a bad bin to report insulation score.
        Fills bins that have a bad bin closer than this distance by nans.
    is_bad_bin_key : str
        Name of the output column to store bad bins
    append_raw_scores : bool
        If True, append columns with raw scores (sum_counts, sum_balanced, n_pixels)
        to the output table.
    clr_weight_name : str or None
        Name of the column in the bin table with weight.
        Using unbalanced data with `None` will avoid masking "bad" pixels.
    verbose : bool
        If True, report real-time progress.
    nproc : int, optional
        How many processes to use for calculation. Ignored if map_functor is passed.
    map_functor : callable, optional
        Map function to dispatch the matrix chunks to workers.
        If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
        If nproc=1 this defaults the builtin map. 

    Returns
    -------
    ins_table : pandas.DataFrame
        A table containing the insulation scores of the genomic bins
    NTcheck_sortingraise_errors0view_df is not a valid viewframe or incompatibler\   #provided cooler is not balanced or  is missingbin-size//bins/r   zEignore_diags not provided, and not found in cooler balancing weights z&ignore_diags must be int or None, got r   r   zThe window sizes z& has to be a multiple of the bin size chromstartendname)r   r   	Exception
ValueErrorr   info_load_attrsrootrstrip
isinstanceintr   isscalararrayanyr   _get_region_insulationr:   r<   concat)clr	window_bpview_dfr   min_dist_bad_binis_bad_bin_keyappend_raw_scores	chunksizerH   verbosenprocmap_functor_ebin_sizebad_win_sizesjobins_region_tables	ins_tables                      r!   calculate_insulation_scorer      sk   p "3''	X'"!	  AA  	X 	X 	XOPPVWW	X  	"3dKKKAA 	 	 	RoRRR 	
 x
#H	??$$'A'A'AA LL	iXgii   
L#	&	& RP,PPQQQ	{9  K	#...I(A-M	vm 
j	- 8jj`hjj
 
 	
  C $C1R1R1R)S)Z[[	+,,Is8   ( 
AAAA   
B*A>>B9C C$c
           	         t          |                                           }
t          |                     d          |
|
fd|          }| j        d         }||z  }|	\  }}}}|||g}	|                                                     |	          }|g d                                         }||j        dddf<   |r||                                         nd||<   |rt          j        d	|            |r.|
                    t          j        ||                   
          }|                     |	          \  }}|||||f         }t          |          D ]\  }}t          j                    5  t          j        dt$                     t'          |||||          \  }}}}t(          j        ||dk    <   t)          j        |          }ddd           n# 1 swxY w Y   t(          j        |t)          j        |           <   ||d||          <   ||d||          <   |r/|j        j        |k     }t(          j        |j        |d||          f<   |r||d||          <   ||d||          <   |S )zH
    Auxilary function to make calculate_insulation_score parallel.
    rr)   )shapefieldr|   ra   re   rf   rg   NregionFzProcessing region dist_bad_binr4   )r   r   rH   r   log2_insulation_score_n_valid_pixels_sum_counts_sum_balanced_)r   rF   r   openrk   fetchcopylocr9   loggingassignr
   dist_to_maskextent	enumeraterA   rB   rC   RuntimeWarningrV   r   nanlog2isfiniter   r:   )rv   rz   rH   r|   rw   ry   r   r{   r}   r   nbinsselectorr   window_binsre   rf   rg   rh   region_bins
ins_regionc0c1region_queryrS   win_bin	ins_trackr   rL   rK   mask_bads                                 r!   rt   rt     s   " 

OOEeU^7i  H
 x
#Hx'K %E5#tUC F((**""6**K6667<<>>J"&JN111h;1@KO$++---e ~  20$00111 
&&!.z./IJJ ' 
 


 ZZFBBrE2b5L)L,, F F
7$&& 	+ 	+!(N;;;<I) /= = =9Ixz )+Ii1n%	**I	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ .0V	2;y)))*>G
:IaL::;7?
3Yq\334 	W!.58HHHPRPVJN8%Lil%L%LLM 	F7AJ3Yq\3349EJ5y|556s   1AGG	G	Q?log2_insulation_score_{WINDOW}n_valid_pixels_{WINDOW}c                     |r3t          j        fd                     d          D                        d|v r~t                      } j        D ]g}t          j        |                    d          |          }|r:|                    t          |
                                d                              hnt          dg          } fd|D             }	g }
 j        j        } j        j        }d	 j        _                             d
d                                d          D ] \  }}|                    dg          }|D ]}|                    |                   j        |	|         k    }|r||j        j        |k    z  }||                    |                   j        |         }t#          j        |           \  }}t'          j        |          t&          j        z  }|||<   |d| }nd}t&          j        ||<   ||j        ||f<   |
                    |           t          j        |
          }|                    d	          }||j        _        |j        |ddf         }|S )aK  Call insulating boundaries.

    Find all local minima of the log2(insulation score) and calculate their
    chromosome-wide topographic prominence.

    Parameters
    ----------
    ins_table : pandas.DataFrame
        A bin table with columns containing log2(insulation score),
        annotation of regions (required),
        the number of valid pixels per diamond and (optionally) the mask
        of bad bins. Normally, this should be an output of calculate_insulation_score.
    view_df : bioframe.viewframe or None
        Viewframe for independent boundary calls for regions
    min_frac_valid_pixels : float
        The minimal fraction of valid pixels in a diamond to be used in
        boundary picking and prominence calculation.
    min_dist_bad_bin : int
        The minimal allowed distance to a bad bin to be used in boundary picking.
        Ignore bins that have a bad bin closer than this distance.
    log2_ins_key, n_valid_pixels_key : str
        The names of the columns containing log2_insulation_score and
        the number of valid pixels per diamond. When a template
        containing `{WINDOW}` is provided, the calculation is repeated
        for all pairs of columns matching the template.

    Returns
    -------
    ins_table : pandas.DataFrame
        A bin table with appended columns with boundary prominences.
    c                 r    g | ]3\  }}|                     t          j        |                              4S )r   )r   r
   r   ).0r   dfrz   s      r!   
<listcomp>z#find_boundaries.<locals>.<listcomp>  sK       FB 		x'<R=O'P'P	QQ  r#   r   z{WINDOW}z(\d+)WINDOWr   Nc                 t    i | ]4}|                     |                                                    z  5S )r   )formatr7   )r   winr   min_frac_valid_pixelsn_valid_pixels_keys     r!   
<dictcomp>z#find_boundaries.<locals>.<dictcomp>  sX         	Y)000<<=AACC
   r#   sorting_indexFT)dropinplacerf   boundary_strength_boundary_strength)r<   ru   groupbysetr1   rematchr   addrp   groupsr5   rh   r:   reset_indexsort_valuesr   r	   find_peak_prominencer   
zeros_liker   r   append	set_index)r   r   ry   log2_ins_keyr   rz   windowscolmmin_valid_pixelsdfs
index_namesorting_orderr   r   r   rT   r   posspromsins_prom_trackbs_keys   ``  ``                r!   find_boundariesr   e  s   P  
I   "+"3"3H"="=  
 
	 \!!%%$ 	0 	0C,,H,==sCCA 0C

1..///	0
 tf++         C%JO*M*IOud333''11  
^^WI&& 	2 	2C%,,C,889@#C() 
   C.2BBB<..c.::;B4HI4iZ@@KD%]955>N#(N4 3c33,BvJ#1BF4<  

2	3B	o	&	&BBHM	qqq 	!BIr#   c                 p   |rK|                                  } t          | dz   |          D ]"}t          j        | t          j        |           #t          j                    5  t          j        d           | j	        d         }t          j        t	          j
        |          z  }t          d|          D ]Q}t          d|dz   |z
            }t          ||z   |          }t	          j        | ||dz   ||f                   ||<   R|r|t	          j        |          z  }ddd           n# 1 swxY w Y   |S )a>  
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    mat : numpy.array
        A dense square matrix of Hi-C interaction frequencies.
        May contain nans, e.g. in rows/columns excluded from the analysis.

    window : int
        The width of the window to calculate the insulation score.

    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, a few first diagonals
        of the Hi-C map should be ignored due to contamination with Hi-C
        artifacts.

    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.

    Returns
    -------
    score : ndarray
        an array with normalized insulation scores for provided matrix
    r   r4   r   N)r   r   r
   set_diagr   r   rA   rB   rC   r   onesr7   r6   nanmeanrD   )	matr   r   rG   rR   r   rU   lohis	            r!   _insul_diamond_denser     sv   6  .hhjj}q(,77 	. 	.Ac261----		 	"	" ) )h'''IaL#q! 	9 	9AQA''BQZ##Bz#b1q5j!B$&6"788E!HH 	)R\%(((E) ) ) ) ) ) ) ) ) ) ) ) ) ) ) Ls   !B>D++D/2D/順 c                 
   |t          |           }n7	 t          || dd          }n"# t          $ r}t          d          |d}~ww xY w| j        d         }|r:	 t          | |d          }n&# t          $ r}t          d| d          |d}~ww xY w|:|                     | j                            d	          d
| z             d         }n)t          |t                    rnt          d| d          ||z  }	||z  dk    rt          d| d|           g }
|g d         j        D ]A\  }}}}|||g}|                                                     |          g d         }t          j        |                                                     |          |         j                  }|                     |                              |          }t#          j                    5  t#          j        dt(                     t+          ||	|          }t          j        ||dk    <   t          j        |          }ddd           n# 1 swxY w Y   t          j        |          }t3          d|          D ]U}|dk    r||z  }|t          j        dg|z  |d|          f         z  }|t          j        ||d         dg|z  f         z  }Vt          j        ||<   ||d<   t          j        |t          j        |           <   ||d| <   t9          j        |           \  }}t          j        |          t          j        z  }|||<   ||d| <   ||d| <   |
                    |           Ct?          j         |
          }|S )a.  Calculate the diamond insulation scores and call insulating boundaries.

    Parameters
    ----------
    clr : cooler.Cooler
        A cooler with balanced Hi-C data. Balancing weights are required
        for the detection of bad_bins.
    window_bp : int
        The size of the sliding diamond window used to calculate the insulation
        score.
    view_df : bioframe.viewframe or None
        Viewframe for independent calculation of insulation scores for regions
    clr_weight_name : str
        Name of the column in bin table that stores the balancing weights.
    min_dist_bad_bin : int
        The minimal allowed distance to a bad bin. Do not calculate insulation
        scores for bins having a bad bin closer than this distance.
    ignore_diags : int
        The number of diagonals to ignore. If None, equals the number of
        diagonals ignored during IC balancing.

    Returns
    -------
    ins_table : pandas.DataFrame
        A table containing the insulation scores of the genomic bins and
        the insulating boundary strengths.
    NTrZ   r]   ra   r^   r_   r`   rb   rc   r   zprovided ignore_diags z is not int or Noner   zThe window size (z') has to be a multiple of the bin size rd   r   )balancer4   bad_bin_maskedr   r   )!r   r   ri   rj   rk   r   rl   rm   rn   ro   rp   r:   rF   r   r   isnanmatrixrA   rB   rC   r   r   r   r   r   r   r_r   r	   r   r   r<   ru   )rv   rw   rx   rH   ry   r   r   r   r   r   r   re   rf   rg   rh   r   r   rW   r   r   bad_bin_neighborrR   r   r   r   r   s                             r!   !_find_insulating_boundaries_denser     s`   H "3''	X'"!	  AA  	X 	X 	XOPPVWW	X x
#H  	"3dKKKAA 	 	 	RoRRR 	 HOOC  #=O#=#==
 

 
L#	&	& US,SSSTTTx'K8q  \	\\RZ\\
 
 	
 #*+L+L+L#M#T "- "-uc4$XXZZ%%f--.G.G.GH
Xchhjj..v66GNOO
JJJ//55f==$&& 	+ 	+!(N;;;,Q\JJI(*Ii1n%	**I		+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ =44q*++ 	X 	XAAvv#3j#@  #3beTFQJ
SVUVTVSV<W6X#X #3beJqrrNTFUVJ<V6W#W  &(f	"#'7
#$-/V	2;y)))*;D
7I7780)<<ey11BF:$t7E
3	3347E
3	334  ,,,,	+,,Is?   ( 
AAAA- -
B7BB(AII	I	Lic                    |t          |           }n7	 t          || dd          }n"# t          $ r}t          d          |d}~ww xY w|dk    rd }n=|dk    rd }n3	 t	          |          fd	}n# t          $ r t          d
          w xY wt          | |||||||	|
|
  
        }t          |||          }|D ]#} ||d|          j                  }||d| <   $|S )a  Find insulating boundaries in a contact map via the diamond insulation score.

    For a given cooler, this function (a) calculates the diamond insulation score track,
    (b) detects all insulating boundaries, and (c) removes weak boundaries via an automated
    thresholding algorithm.

    Parameters
    ----------
    clr : cooler.Cooler
        A cooler with balanced Hi-C data.
    window_bp : int or list of integers
        The size of the sliding diamond window used to calculate the insulation
        score. If a list is provided, then a insulation score if done for each
        value of window_bp.
    view_df : bioframe.viewframe or None
        Viewframe for independent calculation of insulation scores for regions
    ignore_diags : int | None
        The number of diagonals to ignore. If None, equals the number of
        diagonals ignored during IC balancing.
    clr_weight_name : str
        Name of the column in the bin table with weight
    min_frac_valid_pixels : float
        The minimal fraction of valid pixels in a diamond to be used in
        boundary picking and prominence calculation.
    min_dist_bad_bin : int
        The minimal allowed distance to a bad bin to report insulation score.
        Fills bins that have a bad bin closer than this distance by nans.
    threshold : "Li", "Otsu" or float
        Rule used to threshold the histogram of boundary strengths to exclude weak
        boundaries. "Li" or "Otsu" use corresponding methods from skimage.thresholding.
        Providing a float value will filter by a fixed threshold
    append_raw_scores : bool
        If True, append columns with raw scores (sum_counts, sum_balanced, n_pixels)
        to the output table.
    verbose : bool
        If True, report real-time progress.
    nproc : int, optional
        How many processes to use for calculation

    Returns
    -------
    ins_table : pandas.DataFrame
        A table containing the insulation scores of the genomic bins
    NTrZ   r]   r   c                 (    | t          |           k    S N)r   xs    r!   r.   zinsulation.<locals>.<lambda>  s    a<??&: r#   Otsuc                 (    | t          |           k    S r   )r   r   s    r!   r.   zinsulation.<locals>.<lambda>  s    a>!+<+<&< r#   c                     | k    S r   r*   )r   thrs    r!   r.   zinsulation.<locals>.<lambda>  s    !s( r#   zAInsulating boundary strength threshold can be Li, Otsu or a float)	rx   rw   r   ry   r{   rH   r|   r}   r~   )r   ry   r   is_boundary_)r   r   ri   rj   floatr   r   r:   )rv   rw   rx   r   rH   r   ry   	thresholdr{   r|   r}   r~   r   r   thresholding_funcr   r   strong_boundariesr   s                     @r!   
insulationr   p  s   v "3''
	X' #!  AA  	X 	X 	XOPPVWW	X D::	f		<<		""C 2 2 2 2 	 	 	S  	
 +!)+'  I  3)  I
  < <--030018
 
 +<	&&&''s    ) 
AAA A5 5B)r   r   )r   r   Tr$   )r   r   r   r   rW   )r   r   T)r   Nr$   r   N)
NNr$   r   r   r   FrX   Fr   )#r   r   basicConfigINFOrA   	functoolsr   numpyr   pandasr<   r>   skimage.filtersr   r   
lib._queryr   libr	   r
   
lib.checksr   r   
lib.commonr   r   r"   rV   mapr   rt   r   r   r   r   r*   r#   r!   <module>r     s   				   ', ' ' ' '                 8 8 8 8 8 8 8 8 $ $ $ $ $ $ ! ! ! ! ! ! ! ! D D D D D D D D 9 9 9 9 9 9 9 9   @ e5 e5 e5 e5N  
z z z zzK K K` 10b b b bJ, , , ,b t t t tt 
r r r r r rr#   