o
    Nrf                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddl	mZmZ e ejd	ed
dejdddddejdddddejddedddejddedejddedddejddedddejdd ed ddejd!d"ejdd#dejd$d%d&ed'ddejd(d)d*eed+ddejd,d-ed.ddejd/d0ed1ddejd2d3ed4ddejd5d6d7dddejd8d9dddejd:d;dddejd<d=eg d>d?ddd@dA ZdS )B    N)Pool   )ice)Cooler)bedsliceparse_cooler_uri   )cli
get_loggercool_uriZ	COOL_PATH)typemetavarz
--cis-onlyzMCalculate weights against intra-chromosomal data only instead of genome-wide.TF)helpZis_flagdefaultz--trans-onlyzMCalculate weights against inter-chromosomal data only instead of genome-wide.z--ignore-diagszNumber of diagonals of the contact matrix to ignore, including the main diagonal. Examples: 0 ignores nothing, 1 ignores the main diagonal, 2 ignores diagonals (-1, 0, 1), etc.)r   r   r   Zshow_defaultz--ignore-distzDistance from the diagonal in bp to ignore. The maximum of the corresponding number of diagonals and `--ignore-diags` will be used.)r   r   z	--mad-maxzIgnore bins from the contact matrix using the 'MAD-max' filter: bins whose log marginal sum is less than ``mad-max`` median absolute deviations below the median log marginal sum of all the bins in the same chromosome.   z	--min-nnzz_Ignore bins from the contact matrix whose marginal number of nonzeros is less than this number.
   z--min-countzRIgnore bins from the contact matrix whose marginal count is less than this number.z--blacklistzPath to a 3-column BED file containing genomic regions to mask out during the balancing procedure, e.g. sequence gaps or regions of poor mappability.)existsz--nprocz-pz.Number of processes to split the work between.   z--chunksizez-czFControl the number of pixels handled by each worker process at a time.g    cAz--tolzKThreshold value of variance of the marginals for the algorithm to converge.gh㈵>z--max-iterszGMaximum number of iterations to perform if convergence is not achieved.   z--namezName of column to write to.weightz--forcez-fz=Overwrite the target dataset, 'weight', if it already exists.z--checkz4Check whether a data column 'weight' already exists.z--stdoutz8Print weight column to stdout instead of saving to file.z--convergence-policyah  What to do with weights when balancing doesn't converge in max_iters. 'store_final': Store the final result, regardless of whether the iterations converge to the specified tolerance; 'store_nan': Store a vector of NaN values to indicate that the matrix failed to converge; 'discard': Store nothing and exit gracefully; 'error': Abort with non-zero exit status.)store_final	store_nandiscarderrorr   c           &      C   s  t t}t| \}}|rMt|d3}|| }||d vr.t| d| d td nt| d| d td W d	   n1 sHw   Y  |	rV|
rVt	d
t|d.}|| }||d v r|s|s~t
d| dd tjd td n|d |= W d	   n1 sw   Y  |d|  d t| }|d	urdd	l}t|%}tj|d| |drdnd	g dg ddtid}W d	   n1 sw   Y  | d	d	 jddd}|j}g }| D ]\}}t|||j|j|jf} || jj  qt!"|}nd	}|d	ur!t#|t$t!%||j& }z/|dkr/t'|}!|!j(}"nt)}"t*j+|||	|
|||||||dd|"d\}#}$W |dkrP|!,  n|dkr[|!,  w w t!-|$d s|.d |dkrt|.d n1|d kr|.d! t!j/|#d	d	< n|d"kr|.d# td n|d$kr|.d# td |rt0|#j1tj2ddd%d&d' d	S t|d)}|| }d(d)d*}%|d j3|fd+|#i|% |d | j45|$ W d	   d	S 1 sw   Y  d	S ),z
    Out-of-core matrix balancing.

    Matrix must be symmetric. See the help for various filtering options to
    mask out poorly mapped bins.

    COOL_PATH : Path to a COOL file.

    rbinsz: No 'z' column found.r   z::z is balanced.r   Nz8Provide at most one of --cis-only and --trans-only flagszr+'z' column already exists. z Use --force option to overwrite.)filezBalancing ""	i   )r   r   r   )chromstartendr    )sepheaderZusecolsnamesZdtypeT)ZobservedF)	chunksizecis_only
trans_onlytolmin_nnz	min_count	blacklistmad_max	max_itersignore_diagsZrescale_marginalsZuse_lockmapZ	convergedz+Iteration limit reached without convergencer   z6Storing final result. Check log to assess convergence.r   zSaving weights as NaN.r   zDiscarding result and aborting.r    z%g)r$   indexZna_repZfloat_formatgzip   )compressionZcompression_optsdata)6r
   __name__r   h5pyFileclickZechosysexitZ
UsageErrorprintstderrinfor   csvopenpdZread_csvSniffer
has_headerreadstrr   groupby
chromsizesZiterrowsr   r    r!   r"   appendr2   valuesnpZconcatenatemaxintceilZbinsizer   Zimap_unorderedr0   r   Ziterative_correctioncloseallr   nanZSeriesZ	to_stringstdoutZcreate_datasetattrsupdate)&r   Znprocr&   r-   r*   r+   r,   r/   r)   r'   r(   r.   nameforcecheckrR   Zconvergence_policyZignore_distloggerZ	cool_pathZ
group_pathZh5grpZclrr@   fZbad_regionsZbins_groupedrH   Zbad_bins_regresultpoolZmap_ZbiasstatsZh5opts r`   [/var/www/html/software/conda/envs/catlas/lib/python3.10/site-packages/cooler/cli/balance.pybalance   s    #
	



	
















$rb   )r;   r:   r8   numpyrK   ZpandasrB   Zmultiprocessr   r1   r   apir   utilr   r   r	   r
   commandargumentrF   optionrM   PathfloatZChoicerb   r`   r`   r`   ra   <module>   s    	

2