
    DUf                        d dl Zd dlZddlmZmZmZ ddlm	Z	m
Z
 ddlmZ g dZd"dZd"dZd"d	Zd"d
Zd#dZd$dZ ej        ej                   ej                     ej        ej                   ej                     ej        ej                   ej                     ej        ej                   ej                     ej        ej                   ej                     ej        ej                   ej                      ej        ej!                   ej"                     ej        ej#                   ej$                    iZ%d Z&	 	 	 	 	 	 	 	 	 	 d%dZ'	 	 	 	 	 	 d&dZ(d'dZ)	 	 	 	 d(dZ*	 	 	 	 	 	 	 	 	 d)dZ+	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*dZ,	 	 	 	 d+dZ-d,dZ.	 	 	 	 	 d-dZ/	 	 	 	 	 	 d.dZ0d/dZ1	 	 	 	 	 	 d0dZ2	 	 	 	 	 d1d!Z3dS )2    N   )arropschecksconstruction)_get_default_colnames_verify_columns)parse_region)selectselect_maskselect_indicesselect_labelsexpandoverlapclustermergecoverageclosestsubtractsetdiffcount_overlapstrim
complementsort_bedframeassign_viewc                    |t                      n|\  }}}t          | |||g           t          |          \  }}}|t          d          || |         |k    }	nT|t          j        }| |         |k    | |         |k     | |         |k    z  | |         | |         k    | |         |k    z  z  z  }	|	                                S )a&  
    Return boolean mask for all genomic intervals that overlap a query range.

    Parameters
    ----------
    df : pandas.DataFrame

    region : str or tuple
        The genomic region to select from the dataframe in UCSC-style genomic
        region string, or triple (chrom, start, end).

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    Boolean array of shape (len(df),)
    Nz*no chromosome detected, check region input)r   r   r	   
ValueErrornpinfto_numpy)
dfregioncolsckskekchromstartendmasks
             I/var/www/html/software/conda/lib/python3.11/site-packages/bioframe/ops.pyr   r      s    ( -1L&(((dJBBBR%%%$V,,E5#}EFFF}"v;&C2%fslr"v~.2"R& RVu_57
 ==??    c                 T    t          j        t          | ||                    d         S )a  
    Return integer indices of all genomic intervals that overlap a query range.

    Parameters
    ----------
    df : pandas.DataFrame

    region : str or tuple
        The genomic region to select from the dataframe in UCSC-style genomic
        region string, or triple (chrom, start, end).

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    1D array of int
    r   )r   nonzeror   r    r!   r"   s      r*   r   r   E   s%    ( :k"fd3344Q77r+   c                 :    | j         t          | ||                   S )a  
    Return pandas Index labels of all genomic intervals that overlap a query
    range.

    Parameters
    ----------
    df : pandas.DataFrame

    region : str or tuple
        The genomic region to select from the dataframe in UCSC-style genomic
        region string, or triple (chrom, start, end).

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    pandas.Index
    )indexr   r.   s      r*   r   r   \   s    * 8KFD1122r+   c                 :    | j         t          | ||                   S )a  
    Return all genomic intervals in a dataframe that overlap a genomic region.

    Parameters
    ----------
    df : pandas.DataFrame

    region : str or tuple
        The genomic region to select from the dataframe in UCSC-style genomic
        region string, or triple (chrom, start, end).

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    df : pandas.DataFrame

    Notes
    -----
    See :func:`.core.stringops.parse_region()` for more information on region
    formatting.

    See also
    --------
    :func:`select_mask`
    :func:`select_indices`
    :func:`select_labels`
    )locr   r.   s      r*   r
   r
   t   s    > 6+b&$//00r+   bothc                    |t                      n|\  }}}t          j        | d|||g           ||t          d          |I|dk     rt          d          d|dz
  z  | |         j        | |         j        z
  z  }| j        ||g         }	n8|'t          |t                    st          d	          |}nt          d
          |                                 }
|dk    s|dk    r| |         j        |z
  |
|<   |dk    s|dk    r| |         |z   |
|<   ||dk     r| |         j        d| |         j        | |         j        z
  z  	                    t          j                  z   }t          j        |
|         j        |          |
|<   t          j        |
|         j        |          |
|<   |C|
||g                                         |
||g<   |
||g         	                    |	          |
||g<   |
S )a@  
    Expand each interval by an amount specified with `pad`.

    Negative values for pad shrink the interval, up to the midpoint.
    Multiplicative rescaling of intervals enabled with scale. Only one of pad
    or scale can be provided. Often followed by :func:`trim()`.

    Parameters
    ----------
    df : pandas.DataFrame

    pad : int, optional
        The amount by which the intervals are additively expanded *on each side*.
        Negative values for pad shrink intervals, but not beyond the interval
        midpoint. Either `pad` or `scale` must be supplied.

    scale : float, optional
        The factor by which to scale intervals multiplicatively on each side, e.g
        ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
        ``scale=1`` returns original intervals. Default False.
        Either `pad` or `scale` must be supplied.

    side : str, optional
        Which side to expand, possible values are 'left', 'right' and 'both'.
        Default 'both'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. Default values are 'chrom', 'start', 'end'.

    Returns
    -------
    df_expanded : pandas.DataFrame

    Notes
    -----
    See :func:`bioframe.trim` for trimming interals after expansion.

    NTraise_errorsr"   z(only one of pad or scale can be suppliedr   z multiplicative scale must be >=0g      ?r   zadditive pad must be integerz$either pad or scale must be suppliedr3   leftright)r   r   is_bedframer   valuesdtypes
isinstanceintcopyastyper   int64minimummaximumround)r    padscalesider"   r#   r$   r%   padstypesdf_expandedmidss               r*   r   r      s   R -1L&(((dJBB
rBB<@@@@S_CDDD		199?@@@eai BrFMBrFM$AB	2r(#	#s## 	=;<<<?@@@''))Kv~~R&-$.Bv~~R&4-B
77b6=C2b6=2b6=+H$I#Q#Q$ $ D !jR)?FFKO jR)?FFKO +RH 5 ; ; = =RH +RH 5 < <U C CRHr+   r7   c           
         |t                      n|\  }}}|t                      n|\  }	}
}t          | |||g           t          ||	|
|g           |                     d          } |                    d          }|g}|	g}|
||z  }||z  }|                     |dd          j        }|                    |dd          j        }t
                              t          |          t          |                    }| |         j        }| |         j        }||
         j        }||         j        }g }|D ]}||v r||         nt          j	        g           }||v r||         nt          j	        g           }g }|j
        dk    o
|j
        dk    }|rWt          j        ||         ||         ||         ||                   }|||dddf                  ||dddf                  ggz  }|dv r||j
        dk    rq|rP|t          j        t          j        |dddf         t          |          	          dk              d                  }n|}||d
t          j        |          z  ggz  }|dv r||j
        dk    rq|rP|t          j        t          j        |dddf         t          |          	          dk              d                  }n|}|d
t          j        |          z  |ggz  }|r1|                    t          j        d |D                                  t          |          dk    rt          j        dt(                    S t          j        |          }|S )a  
    Find pairs of overlapping genomic intervals and return the integer
    indices of the overlapping intervals.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.

    how : {'left', 'right', 'outer', 'inner'}, default 'left'
        How to handle the overlaps on the two dataframes.
        left: use the set of intervals in df1
        right: use the set of intervals in df2
        outer: use the union of the set of intervals from df1 and df2
        inner: use intersection of the set of intervals from df1 and df2

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    on : list or None
        Additional shared columns to consider as separate groups.

    Returns
    -------
    overlap_ids : numpy.ndarray
        The indices of the overlapping genomic intervals in the original
        dataframes. The 1st column contains the indices of intervals
        from the 1st set, the 2nd column - the indicies from the 2nd set.
    NTdropF)observeddropnar   r   )outerr7   )	minlength)rP   r8   c                 &    g | ]}d  |D             S )c                 &    g | ]}|d d d f         S N ).0idxss     r*   
<listcomp>z/_overlap_intidxs.<locals>.<listcomp>.<listcomp>j  s$    ===4aaag===r+   rV   )rW   	idxs_pairs     r*   rY   z$_overlap_intidxs.<locals>.<listcomp>i  s6       % >=9===  r+   r      shapedtype)r   r   reset_indexgroupbyindicessetunionr:   r   arraysizer   overlap_intervalswherebincountlen	ones_likeappendblockndarrayr=   vstack)df1df2howcols1cols2onck1sk1ek1ck2sk2ek2group_list1group_list2
df1_groups
df2_groups
all_groupsstarts1ends1starts2ends2overlap_intidxs
group_keysdf1_group_idxsdf2_group_idxsoverlap_intidxs_subboth_groups_nonemptyoverlap_idxs_locno_overlap_ids1no_overlap_ids2s                                 r*   _overlap_intidxsr      s,   D 05})+++%MCc/4})+++%MCcC#sC)))C#sC))) //t/
$
$C
//t/
$
$C %K%K	~rr[4FFNJ[4FFNJ3z??C
OO<<J #hoGHOE#hoGHOE O  K K
'1Z'?'?Jz""bhrll 	 (2Z'?'?Jz""bhrll 	 ! . 3a 7Vn>QTU>U 	%7'n%'n%	     "#3AAAqD#9:"#3AAAqD#9:$  ###(;a(?(?# 
1"0H,QQQT2c.>Q>Q    
 # #1#o666$  $$$)<q)@)@# 
1"0H,QQQT2c.>Q>Q    
 # #1o666#$   	"" )<      ?q  zc2222i00Or+   c                     	 t          j        |           } n# t          $ r | cY S w xY wt                              | |           S rU   )r   r_   	TypeErrorNUMPY_INT_TO_DTYPEgetr_   s    r*   _to_nullable_dtyper     sP       !!%///s    &&TF _c           	      J	   |t                      n|\  }}}|	t                      n|	\  }}}t          j        | d|||g           t          j        |d|||g           |dk    r|d}|dk    r|rt          d          |
[t	          |
t
                    st          d          ||
v s||
v rt          d          t          | |
           t          ||
           t          | ||||	|
          }|ddd	f         }|ddd
f         }t	          |t                    r|nd}|d	         z   }|d
         z   }t          j
        || j        |         it          j                              }t          j
        ||j        |         it          j                              }d}|rt	          |t                    r|nd}|dz   |z   }|dz   |z   }t          j        | |         j        |         ||         j        |                   }t          j        | |         j        |         ||         j        |                   }t          j
        ||||i          }d} d}!|st          |          dk    s|dk    r9| j        |                             d          } fd| j        D             | _        |st          |          dk    s|dk    r9|j        |                             d          }!fd|!j        D             |!_        |dk    r2|dk    }"|dk    }#|"                                }$|#                                }%d||"<   d||#<   | f|r]|$r[|                     |d	         z   t-          | |         j                  |d	         z   t-          | |         j                  i          } |$rd| |"<   |!f|r]|%r[|!                    |d
         z   t-          ||         j                  |d
         z   t-          ||         j                  i          }!|%rd|!|#<   |"|r |$s|%r|                                }d||"|#z  <   t          j        || ||!|gd          }&|r|&                    ||g          }&|s|&                    ||gd
d           |&                    dd           |&S )an  
    Find pairs of overlapping genomic intervals.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.

    how : {'left', 'right', 'outer', 'inner'}, default 'left'
        How to handle the overlaps on the two dataframes.
        left: use the set of intervals in df1
        right: use the set of intervals in df2
        outer: use the union of the set of intervals from df1 and df2
        inner: use intersection of the set of intervals from df1 and df2

    return_input : bool, optional
        If True, return columns from input dfs. Default True.

    return_index : bool, optional
        If True, return indicies of overlapping pairs as two new columns
        ('index'+suffixes[0] and 'index'+suffixes[1]). Default False.

    return_overlap : bool, optional
        If True, return overlapping intervals for the overlapping pairs
        as two additional columns (`overlap_start`, `overlap_end`).
        When `cols1` is modified, `start` and `end` are replaced accordingly.
        When `return_overlap` is a string, its value is used for naming the overlap
        columns: `return_overlap + "_start"`, `return_overlap + "_end"`.
        Default False.

    suffixes : (str, str), optional
        The suffixes for the columns of the two overlapped sets.

    keep_order : bool, optional
        If True and how='left', sort the output dataframe to preserve the order
        of the intervals in df1. Cannot be used with how='right'/'outer'/'inner'.
        Default True for how='left', and None otherwise.
        Note that it relies on sorting of index in the original dataframes,
        and will reorder the output by index.

    cols1, cols2 : (str, str, str) or None, optional
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    on : list or None, optional
        List of additional shared columns to consider as separate groups
        when considering overlaps. A common use would be passing on=['strand'].
        Default is None.

    ensure_int : bool, optional [default: True]
        If True, ensures that the output dataframe uses integer dtypes for
        start and end coordinates. This may involve converting coordinate
        columns to nullable types in outer joins. Default True.

    Returns
    -------
    df_overlap : pandas.DataFrame

    Notes
    -----
    If ``ensure_int`` is False, inner joins will preserve coordinate dtypes
    from the input dataframes, but outer joins will be subject to native type
    casting rules if missing data is introduced. For example, if `df1` uses a
    NumPy integer dtype for `start` and/or `end`, the output dataframe will
    use the same dtype after an inner join, but, due to casting rules, may
    produce ``float64`` after a left/right/outer join with missing data stored
    as ``NaN``. On the other hand, if `df1` uses Pandas nullable dtypes, the
    corresponding coordinate columns will preserve the same dtype in the
    output, with missing data stored as ``NA``.
    NTr5   r7   z+keep_order=True only allowed for how='left'on=[] must be None or list,on=[] should not contain chromosome colnamesrr   rs   rt   ru   r   r   r0   r   r   r   1rL   c                 &    g | ]}|d          z   S r   rV   rW   csuffixess     r*   rY   zoverlap.<locals>.<listcomp>  !    JJJ!a(1+oJJJr+   2r8   c                 &    g | ]}|d          z   S r   rV   r   s     r*   rY   zoverlap.<locals>.<listcomp>   r   r+   innerrR   columnsaxis)r   inplacerM   r   )r   r   r9   r   r<   listr   r   strpd	DataFramer0   
Int64Dtyper   rB   r:   rA   ilocr`   r   anyr?   r   r_   convert_dtypesconcatsort_valuesrM   )'rp   rq   rr   return_inputreturn_indexreturn_overlapr   
keep_orderrs   rt   ru   
ensure_intrv   rw   rx   ry   rz   r{   overlap_df_idxsevents1events2	index_colindex_col_1index_col_2
df_index_1
df_index_2
df_overlapoverlap_coloverlap_col_sk1overlap_col_ek1overlap_startoverlap_end
df_input_1
df_input_2
is_na_leftis_na_rightany_na_leftany_na_rightout_dfs'         `                                r*   r   r     sy   j 05})+++%MCc/4})+++%MCc
sS#sODDDD
sS#sODDDDvJ.
v:FGGG	~"d## 	;9:::2II3"99KLLLR   R   &  O aaad#Gaaad#G !+< = =J7Ihqk)Khqk)K{CIg,>?r}WWWJ{CIg,>?r}WWWJJ 
(2>3(G(GVnnY%+c1%+c1
HOG$HOG$
 

 jHOG$HOG$
 

 \m_kJ
 

 JJ Ks<((C//<63I3IXg&222==
JJJJz7IJJJ
 Ks<((C//<73J3JXg&222==
JJJJz7IJJJ
 g~~]
m nn&&"((!%
:"&
;! k '..hqk)+=c#hn+M+Mhqk)+=c#hn+M+M 
  .)-
:&! l '..hqk)+=c#hn+M+Mhqk)+=c#hn+M+M 
  /*.
;'! <{ <l <'6688
7;
:34Y	ZZD9  F  @##[+$>?? F[+.QEEE
D$///Mr+   c                 .	   ||dk     rt          d          |t                      n|\  t          | g           | j        }|                     d          } g}|Lt          |t                    st          d          |v rt          d          t          | |           ||z  }|                     |d          j        }	t          j
        | j        d         d	          }
g }d	}|	                                D ]\  }}t          j        t          j        |                                                    r?|j        rG| j        |         }t'          j        |         j                            t          j                  |         j                            t          j                  |
          \  }}}t          j        |          }||dz   z  }|j        d         }||z  }||
|j        <   t          |t2                    r|f}i }|D ]R}t          j        t          j
        |||                    |                             | |         j                  ||<   S||<   ||<   ||d<   t          j        |          }|                    |           t          j        | g|                                       d          }|                                dk    rY|dz   t          j        t          j        |j                            z   |
|j        <   |                    | j        |                    t          j         |                              d          }|                                dk    r;|                    t          j!                    t          j!                    i          }t          j"        |
dk              sJ t          |#                                          }|gfd|D             z            }i }|r|
|d<   |r,|         j        |
         |d<   |         j        |
         |d<   t          j        |          }|rt          j         | |gd          }|$                    |           |S )a  
    Cluster overlapping intervals into groups.

    Can return numeric ids for these groups (with `return_cluster_ids`=True)
    and/or their genomic coordinates (with `return_cluster_intervals`=True).
    Also see :func:`merge()`, which discards original intervals and returns a
    new set.

    Parameters
    ----------
    df : pandas.DataFrame

    min_dist : float or None
        If provided, cluster intervals separated by this distance or less.
        If ``None``, do not cluster non-overlapping intervals.
        Since bioframe uses semi-open intervals, interval pairs [0,1) and [1,2)
        do not overlap, but are separated by a distance of 0. Such adjacent
        intervals are not clustered when ``min_dist=None``, but are clustered
        when ``min_dist=0``.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    on : None or list
        List of column names to perform clustering on independently, passed as
        an argument to df.groupby before clustering. Default is ``None``.
        An example useage would be to pass ``on=['strand']``.

    return_input : bool
        If True, return input

    return_cluster_ids : bool
        If True, return ids for clusters

    return_cluster_invervals : bool
        If True, return clustered interval the original interval belongs to

    Returns
    -------
    df_clustered : pd.DataFrame

    Nr   min_dist>=0 currently requiredTrL   r   r   rN   rR   min_distr   datar_   n_intervalsr   c                 "    g | ]}|fv	|S rV   rV   rW   colr#   r%   r$   s     r*   rY   zcluster.<locals>.<listcomp>  )    QQQRRL9P9P9P9P9Pr+   r   cluster_startcluster_endr   )%r   r   r   r0   r`   r<   r   ra   groupsr   fullr^   itemsr   isnaSeriesr   emptyr2   r   merge_intervalsr:   r?   r@   ri   r   r_   r   rl   isnullsumaranger   r   allkeys	set_index)r    r   r"   ru   r   return_cluster_idsreturn_cluster_intervalsdf_index
group_list	df_groupscluster_idsclustersmax_cluster_idr   df_group_idxsdf_groupcluster_ids_groupcluster_starts_groupcluster_ends_groupinterval_counts
n_clustersclusters_groupr   df_nansclusters_namesr   r#   r%   r$   s                             @@@r*   r   r   S  s   h a<<=>>>,0L&(((dJBBBR%%% xH	T	"	"B J	~"d## 	;9:::88KLLLBb


:
55<I'"(1+r**KHN%.__%6%6 %( %(!
M729Z(())--// 	 	6-(
 "RL&&rx00RL&&rx00
 
 
		
  +&788^a//)/2
*$,=M() j#&& 	'$J 	 	C"$)WZJ4D4DS4I4I)JKKgm# # #N3 2r/r(7}%n55''''iB0Z012266A6>>G{{}}qQ26'.+A+A!B!BB 	GN# 	w(((y""..D.99H{{}}q??BR]__#MNN6+"##### (--//**N	RQQQQQQ~QQQQH F ('y A"*2,"5k"B ( 3K @}\&!!F 9B<i888
XMr+   c                    ||dk     rt          d          |t                      n|\  t          j        | dg           |                                 } |                     dd           g}|Lt          |t                    st          d          |v rt          d          t          | |           ||z  }| 	                    |d	          j
        }g }|                                D ]\  }}t          j        t          j        |                                                    r?|j        rG| j        |         }	t%          j        |	         j                            t,          j                  |	         j                            t,          j                  |
          \  }
}}t-          j        |
          }|j        d         }t          |t4                    r|f}i }|D ]R}t          j        t-          j        |||                    |                             | |         j                  ||<   S||<   ||<   ||d<   t          j        |          }|                    |           t          j         | g|                                       d          }|!                                }|rlt          j        t          j"        g|z  dg| j        |         j                  }|                    t          j#        | j        |         |gd                     t          j#        |                              d          }|rN|                    t          j$                    t          j$                    dt          j$                    i          }t          |%                                          }|gfd|D             z            }|S )a  
    Merge overlapping intervals.

    This returns a new dataframe of genomic intervals, which have the genomic
    coordinates of the interval cluster groups from the input dataframe. Also
    :func:`cluster()`, which returns the assignment of intervals to clusters
    prior to merging.

    Parameters
    ----------
    df : pandas.DataFrame

    min_dist : float or None
        If provided, merge intervals separated by this distance or less.
        If None, do not merge non-overlapping intervals. Using
        ``min_dist=0`` and ``min_dist=None`` will bring different results.
        bioframe uses semi-open intervals, so interval pairs [0,1) and [1,2)
        do not overlap, but are separated by a distance of 0. Adjacent intervals
        are not merged when ``min_dist=None``, but are merged when ``min_dist=0``.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    on : None or list
        List of column names to perform clustering on independently, passed as
        an argument to df.groupby before clustering. Default is None.
        An example useage would be to pass ``on=['strand']``.

    Returns
    -------
    df_merged : pandas.DataFrame
        A pandas dataframe with coordinates of merged clusters.

    Notes
    -------
    Resets index.

    Nr   r   Tr5   r   rM   r   r   r   r   r   r   r   r   )r   r0   rL   c                 "    g | ]}|fv	|S rV   rV   r   s     r*   rY   zmerge.<locals>.<listcomp>h  r   r+   )&r   r   r   r9   r>   r`   r<   r   r   ra   r   r   r   r   r   r   r   r2   r   r   r:   r?   r   r@   ri   r^   r   r   r0   r_   r   rl   r   r   NAr   r   r   )r    r   r"   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   df_has_nansnan_intervalsr   r#   r%   r$   s                        @@@r*   r   r     s    R a<<=>>> -1L&(((dJBB
rBB<@@@@	BNN4dN+++ J	~"d## 	;9:::88KLLLBb


:
55<IH%.__%6%6 "( "(!
M729Z(())--// 	 	6-(
 "RL&&rx00RL&&rx00
 
 
		
  +&788)/2
 j#&& 	'$J 	 	C"$)WZJ4D4DS4I4I)JKKgm# # #N3 2r/r(7}%n55''''iB0Z012266A6>>G++--K 
UGk!"O&/'
 
 

 	I-0  	
 	
 	
 y""..D.99H 
??"bmoo}bmooV
 

 (--//**N	RQQQQQQ~QQQQH Or+   c                 l   |t                      n|\  }}}|t                      n|\  }	}
}|                     dd           t          ||          }t          | |d|ddd||	  	        }|d|          |d|          z
  |d<   t	          j        |                    d	|d
         z                                 ddi          d                             | |         j	                            
                    ddi                              d          }|rt	          j        | |gd          }|S )a  
    Quantify the coverage of intervals from 'df1' by intervals from 'df2'.

    For every interval in 'df1' find the number of base pairs covered by
    intervals in 'df2'. Note this only quantifies whether a basepair in 'df1'
    was covered, as 'df2' is merged before calculating coverage.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.

    suffixes : (str, str)
        The suffixes for the columns of the two overlapped sets.

    return_input : bool
        If True, return input as well as computed coverage

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    df_coverage : pandas.DataFrame

    Notes
    ------
    Resets index.

    NTr   r"   r7   )rr   r   r   r   r   rs   rt   overlap_r   r0   r   r   r   r   rL   r   r   )r   r`   r   r   r   r   ra   aggr?   r_   renamer   )rp   rq   r   r   rs   rt   rv   rw   rx   ry   rz   r{   
df2_mergedr   r   s                  r*   r   r   n  sh   R 05})+++%MCc/4})+++%MCcOODtO,,,s'''J
 
 
J ''7#'7'78:FVQTFVFV;WWJy 	w!455S)U#$$Y0VCHN##	
 	

 
J/	0	0	$		   :C=y999Mr+   c
                    |t                      n|\  }
}}|	t                      n|	\  }}}d}||| u r&t          |           dk    rt          d          | }d}|                     d          } |                    d          }|                     |
d          j        }|                    |d          j        }g }|                                D ]M\  }}||vrFt          j        |dt          j	        |          z  g          j
        }|                    |           P||         }| j        |         }|j        |         }d}t          |t                    r||         j        }n/t!          |          r ||          j        }nt          d	           d}|.t          j        t          |          t          j        
          }n||         j        dk    }t'          j        ||         j        ||         j        |rdn||         j        |rdn||         j        ||||||
  
        }t          j        t          j        t          |                    |dddf         d          }t          j        t          j        |j        |dddf                  |j        |dddf                  g          j
        t          j        |j        |         dt          j	        |j        |                   z  g          j
        g          }|                    |           Ot          |          dk    rt          j        dt2                    S t          j        |          }|S )a
  
    For every interval in set 1 find k closest genomic intervals in set2 and
    return their integer indices.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.
        If df2 is None or same object as df1, find closest intervals within
        the same set.

    k_closest : int
        The number of closest intervals to report.

    cols1, cols2 : (str, str, str)
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    closest_ids : numpy.ndarray
        The indices of the overlapping genomic intervals in the original
        dataframes. The 1st column contains the indices of intervals
        from the 1st set, the 2nd column - the indicies from the 2nd set.
        The second column is filled with -1 for those intervals in the 1st
        set with no closest 2nd set interval.
    NFr   zKdf1 must have more than one interval to find closest non-identical intervalTrL   r   rR   zHtie_breaking_col must be either a column label or f(DataFrame) -> Seriesr   -)ktie_arrignore_overlapsignore_upstreamignore_downstream	directionr   )invertr[   r]   )r   rj   r   r`   ra   r   r   r   ro   rk   Trl   r2   r<   r   r:   callableonesbool_r   closest_intervalsisinr   concatenatern   r=   )rp   rq   r  r  r  r  direction_coltie_breaking_colrs   rt   rv   rw   rx   ry   rz   r{   self_closestr~   r   closest_intidxsr   r   closest_idxs_groupr   	df1_group	df2_groupr  direction_arrna_idxss                                r*   _closest_intidxsr#    s   T 05})+++%MCc/4})+++%MCcLs88q==)    //t/
$
$C
//t/
$
$C S4007JS4007JO&0&6&6&8&8 M3 M3"
NZ''!#"n555" "
   ""#5666#J/GN+	GN+	&,, 	 018GG&'' 	&&y118GG)    GC	NN"(CCCMM -(/36 
 $5cN!cN! ;DDin&; ;DDin&;++/#
 
 
 'Ic.))**,>qqq!t,DT
 
 
  ^	&-.@A.FG&-.@A.FG 
 	&-g6R\.*?*HIII 
 
 
" 	12222
?q  zc2222i00Or+   c                 	   |dk     rt          d          || u rt          d          || }|t                      n|\  }}}|t                      n|\  }}}t          j        | d|||g           t          j        |d|||g           t	          | |||||||||
  
        }|dddf         dk    }d}d}|	rt          |	t                    r|	nd	}t          j        |d
         z   | j	        |ddd
f                  i          }t          j        |d         z   |j	        |dddf                  i          }t          j
        ||<   d}|rat          j        t          j        | |         j        |ddd
f                  ||         j        |dddf                  g          d
          }t          j        t          j        | |         j        |ddd
f                  ||         j        |dddf                  g          d
          }||k     }t          j        |t          j        ||d          t          j        ||d          d          }|                    t          j                    t          j                    t          j                    d          }t          j
        ||<   d}|
rt          j        d
| |         j        |ddd
f                  ||         j        |dddf                  z
            }t          j        d
||         j        |dddf                  | |         j        |ddd
f                  z
            } t          j        t          j        || g          d
          }!t          j        d|!it          j                              }t          j
        ||<   d}"d}#|st          |          dk    s|dk    rC| j        |ddd
f                                      d          }"fd|"j        D             |"_        |st          |          dk    s|dk    r|j        |dddf                                      d          }#fd|#j        D             |#_        |#                    |d         z   t          j                    |d         z   t          j                    i          }#t          j
        |#|<   t          j        ||"||#||gd          }$|$S )aY  
    For every interval in dataframe `df1` find k closest genomic intervals in
    dataframe `df2`.

    Currently, we are not taking the feature strands into account for filtering.
    However, the strand can be used for definition of upstream/downstream of
    the feature (direction).

    Note that, unless specified otherwise, overlapping intervals are considered
    as closest. When multiple intervals are located at the same distance, the
    ones with the lowest index
    in `df2` are returned.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.
        If `df2` is None, find closest non-identical intervals within the same
        set.

    k : int
        The number of the closest intervals to report.

    ignore_overlaps : bool
        If True, ignore overlapping intervals and return the closest
        non-overlapping interval.

    ignore_upstream : bool
        If True, ignore intervals in `df2` that are upstream of intervals in
        `df1`, relative to the reference strand or the strand specified by
        direction_col.

    ignore_downstream : bool
        If True, ignore intervals in `df2` that are downstream of intervals
        in `df1`, relative to the reference strand or the strand specified by
        direction_col.

    direction_col : str
        Name of direction column that will set upstream/downstream orientation
        for each feature. The column should contain bioframe-compliant strand
        values ("+", "-", ".").

    tie_breaking_col : str
        A column in `df2` to use for breaking ties when multiple intervals
        are located at the same distance. Intervals with *lower* values will
        be selected.

    return_input : bool
        If True, return input

    return_index : bool
        If True, return indices

    return_distance : bool
        If True, return distances. Returns zero for overlaps.

    return_overlap : bool
        If True, return columns: 'have_overlap', 'overlap_start', and
        'overlap_end'. Fills df_closest['overlap_start'] and df['overlap_end']
        with None if non-overlapping. Default False.

    suffixes : (str, str)
        The suffixes for the columns of the two sets.

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.


    Returns
    -------
    df_closest : pandas.DataFrame
        If no intervals found, returns none.

    Notes
    -----
    By default, direction is defined by the reference genome: everything with
    smaller coordinate is considered upstream, everything with larger
    coordinate is considered downstream.

    If ``direction_col`` is provided, upstream/downstream are relative to the
    direction column in ``df1``, i.e. features marked "+" and "." strand will
    define upstream and downstream as above, while features marked "-" have
    upstream and downstream reversed: smaller coordinates are downstream and
    larger coordinates are upstream.
    r   zk>=1 requiredzJpass df2=None to find closest non-identical intervals within the same set.NTr5   )r  r  r  r  r  r  rs   rt   rR   r0   r   r   )have_overlapr   r   distancer   r   r7   rL   c                 &    g | ]}|d          z   S r   rV   r   s     r*   rY   zclosest.<locals>.<listcomp>  r   r+   r   r8   c                 &    g | ]}|d          z   S r   rV   r   s     r*   rY   zclosest.<locals>.<listcomp>"  r   r+   r   )r   r   r   r9   r#  r<   r   r   r   r0   r   r   amaxro   r:   aminrh   r?   BooleanDtyper   rB   r   r`   r   r   )%rp   rq   r  r  r  r  r  r  r   r   return_distancer   r   rs   rt   rv   rw   rx   ry   rz   r{   closest_df_idxsna_maskr   r   r   r   r   r   r%  df_distancedistance_leftdistance_rightr&  r   r   r   s%               `                        r*   r   r   P  s|   R 	1uu)))
czzX
 
 	

 {/4})+++%MCc/4})+++%MCc
sS#sODDDD
sS#sODDDD&
''+#)  O aaad#r)G JJ $$.|S$A$ANLLw	\!$ci10E&FG
 

 \!$ci10E&FG
 

 !e
7J #$IHOOAAAqD$9:HOOAAAqD$9:  
 
 
 gIHOOAAAqD$9:HOOAAAqD$9:  
 
 
 %{2\ ,!#,t!L!L!xk4HH 
 

  && " 1 1!#!} 
 

 !e
7K %
HOOAAAqD12#hooaaad345
 

 HOOAAAqD12#hooaaad345
 

 729m^%DEEANNNlJ#9QQQ!uGJJ Ks<((C//<63I3IXoaaad34@@d@KK
JJJJz7IJJJ
 $s<((C//<73J3JXoaaad34@@d@KK
JJJJz7IJJJ
&&8A;x{1BBMOOT
 

 !e
7Y	ZZ[Q  F
 Mr+   c                    |t                      n|\  |t                      n|\  }}}|d         z   dz   dz   i}	fdt          | j                  D             }
|
D ]}||	||d         z   <   |r.d|d         z   |	d|d         z   <   d|d         z   |	d|d         z   <   t          j        t          t          j        |                                                              t          t          j        ||                                                             z             }t          |          dk    rt          d          t          | t          |d	 |D             |
                              |t          j                    |t          j                    i          d||dd||	  	        t          |	                   }|                    |	d           |j        t          j        |         j                            }|                    dd           |r|d|d         z            j        }|d|d         z                                            }t          j        |          D ]2}|||k    xx         |||k                                             z  cc<   3|                                |d|d         z   <   |                    d|d         z   gd           |S )a  
    Generate a new set of genomic intervals by subtracting the second set of
    genomic intervals from the first.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.

    return_index : bool
        Whether to return the indices of the original intervals
        ('index'+suffixes[0]), and the indices of any sub-intervals split by
        subtraction ('sub_index'+suffixes[1]). Default False.

    suffixes : (str,str)
        Suffixes for returned indices. Only alters output if return_index is
        True. Default ("","_").

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    df_subtracted : pandas.DataFrame

    Notes
    -----
    Resets index, drops completely subtracted (null) intervals, and casts to
    pd.Int64Dtype().

    Nr   r  c                 "    g | ]}|fv	|S rV   rV   )rW   irv   rx   rw   s     r*   rY   zsubtract.<locals>.<listcomp>b  s)    PPPQqc37O7Oq7O7O7Or+   r0   complement_indexr   z*No chromosomes remain after dropping nullsc                 V    i | ]&}|t          j        t           j                  j        'S rV   r   iinfor@   maxrW   r4  s     r*   
<dictcomp>zsubtract.<locals>.<dictcomp>r  s)    HHH!RXbh//3HHHr+   )view_dfr"   r7   T)rr   r   r   r   r   rs   rt   r   r   r   	sub_index)r   r   r   r   uniquer   rO   rj   r   r   r   r?   r   r  r   r   r:   r`   r>   minrM   )rp   rq   r   r   rs   rt   ry   rz   r{   name_updatesextra_columns_1r4  
all_chromsdf_subtractedinds	comp_indsrv   rx   rw   s                   @@@r*   r   r   0  s*   T 05})+++%MCc/4})+++%MCc 	hqk3S#S#L
 QPPPPP$s{"3"3PPPO * *()Q!_%% O.5.CWx{*+.@8A;.NWx{*+RYs3x(())**T")CHOO<M<M2N2N-O-OO J :!EFFFHHZHHHu	
 	
 	

&#r}R]__=
>
>!   <M t<<<!&c0B0I(J(J'JKM4666 UWx{23:!"4x{"BCHHJJ	4 	? 	?Adai   Idai$8$<$<$>$>>    3<>>3C3CkHQK/0$6!$D#EtTTTr+   c                    |t                      n|\  }}}|t                      n|\  }}	}
t          | |d|||          }t          j        t          j        t          |                     |dddf                   }| j        |         }|S )a`  
    Generate a new dataframe of genomic intervals by removing any interval
    from the first dataframe that overlaps an interval from the second
    dataframe.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as DataFrames.

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each dataframe.
        The default values are 'chrom', 'start', 'end'.

    on : None or list
        Additional column names to perform clustering on independently, passed
        as an argument to df.groupby when considering overlaps and must be
        present in both dataframes.
        Examples for additional columns include 'strand'.

    Returns
    -------
    df_setdiff : pandas.DataFrame

    Nr   r   r   )r   r   r   	setdiff1dr   rj   r   )rp   rq   rs   rt   ru   rv   rw   rx   ry   rz   r{   df_overlappedinds_non_overlapped
df_setdiffs                 r*   r   r     s    6 05})+++%MCc/4})+++%MCc$SgU%B  M ,ryS':':M!!!Q$<OPP-.Jr+   c                 N   |                      dd           t          | |ddd|d|||
  
        }t          j        |                    d|d         z   g          d|d         z                                            j        d	g
          }|rt          j        | |gd          }|S )az  
    Count number of overlapping genomic intervals.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Two sets of genomic intervals stored as a DataFrame.

    suffixes : (str, str)
        The suffixes for the columns of the two overlapped sets.

    return_input : bool
        If True, return columns from input dfs. Default True.

    cols1, cols2 : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    on : list
        List of additional shared columns to consider as separate groups
        when considering overlaps. A common use would be passing on=['strand'].
        Default is None.

    Returns
    -------
    df_counts : pandas.DataFrame

    Notes
    -------
    Resets index.

    Tr   r7   F)rr   r   r   r   r   ru   rs   rt   r0   r   r   countr  r   r   )r`   r   r   r   ra   rM  r:   r   )	rp   rq   r   r   rs   rt   ru   	df_countsr   s	            r*   r   r     s    V OODtO,,,  I \7Xa[01227Xa[3HI				  F  :C=y999Mr+   namec           
          |t                      n|\  }}}	t          | |||	g           t          | j                  }
|                                 }d}|L|}d t          | |                                                                         j                  D             }d}|t                      n|\  }}}t          j	        |||||g          
                    t          t          |||g|||	g                              }|r|}ng|;t          |dgd          rt          d	          d}t          ||d||||
          }n*t          ||g           t          j        ||d||           |                    |d||d          }t%          j        ||         j                  }|                                rTt$          j        |j        ||||	gf<   |                    |t%          j                    |	t%          j                    i           ||dz            j        }||	dz            j        }||                             ||          ||<   ||	                             ||          ||	<   |r|S ||
         S )ac  
    Trim each interval to fall within regions specified in the viewframe 'view_df'.

    Intervals that fall outside of view regions are replaced with nulls.
    If no 'view_df' is provided, intervals are truncated at zero to avoid
        negative values.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : None or pandas.DataFrame
        View specifying region start and ends for trimming. Attempts to
        convert dictionary and pd.Series formats to viewFrames.

    df_view_col : str or None
        The column of 'df' used to specify view regions.
        The associated region in 'view_df' is then used for trimming.
        If None, :func:'bioframe.ops.assign_view' will be used to assign view
        regions. If no 'view_df' is provided, uses the 'chrom' column,
        df[cols[0]]. Default None.

    view_name_col : str
        Column of df with region names. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    cols_view : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in the view. The default values are 'chrom',
        'start', 'end'.

    Returns
    -------
    df_trimmed : pandas.DataFrame

    NFc                 V    i | ]&}|t          j        t           j                  j        'S rV   r7  r:  s     r*   r;  ztrim.<locals>.<dictcomp>/  s9     
 
 
 rx!!%
 
 
r+   Tview_name_colr"   r  view_regionreturn_as_bool-column view_region already exists in input df)drop_unassigneddf_view_colrS  r"   	cols_viewr6   rY  rS  r7   r   _view)rr   left_onright_onr   r]  )lowerupper)r   r   r   r   r>   rc   rO   r:   r   make_viewframer  dictzipr   r   r   is_catalogedr   r   r   r   r   r2   r?   r   clip)r    r<  rY  rS  return_view_columnsr"   rZ  r#   r$   r%   
df_columns
df_trimmedinferred_viewckvskvekvunassigned_intervalslower_vectorupper_vectors                      r*   r   r     s   b -1L&(((dJBBBR%%%bj!!JJM
 
K--//6688?@@
 
 
 /8/@)+++iMCc)}Cc?  fT#sCoB|<<==f>>   
		:tLLL 	NLMMM# !#'
 
 


 	
[M222#'	
 	
 	
 	
 !! "  J 9Z%<%CDD!! F=?U
+b"b\9:2r}BMOODEEEb7l+2Lb7l+2L^((|<(PPJrN^((|<(PPJrN &*%%r+   c           
      V   |t                      n|\  }}}t          | |||g           |6d t          | |                                         j                  D             }|t                      n|\  }}	}
t          j        ||||	|
g                              t          t          ||	|
g|||g                              }t          || ddd||          }||d	|z   d	|z   |g                                         }|                    d	|z   |d	|z   ||d
id           |} t          j        | |dd
|           |                     d
          j        }t!          t          ||                             }g }|D ]}|j        ||         |k             }||||g         j        d         \  }}}||vrR|                                                    |d
i          }|                    t'          j        |                     ||         j        }| j        |         }t+          j        ||         j                            t0          j                  ||         j                            t0          j                  ||f          \  }}|t'          j        t1          j        |j        d         |          | |         j                  ||||d
|i}t'          j        |          }|                    |           t'          j        |                              d          }|S )a  
    Find genomic regions in a viewFrame 'view_df' that are not covered by any
    interval in the dataFrame 'df'.

    First assigns intervals in 'df' to region in 'view_df', splitting intervals
    in 'df' as necessary.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.Dataframe
        If none, attempts to infer the view from chroms (i.e. df[cols[0]]).

    view_name_col : str
        Name of column in view_df with unique reigon names. Default 'name'.

    cols : (str, str, str)
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    cols_view : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in the view. The default values are 'chrom',
        'start', 'end'.

    Returns
    -------
    df_complement : pandas.DataFrame

    Notes
    ------
    Discards null intervals in input, and df_complement has regular int dtype.

    Nc                 V    i | ]&}|t          j        t           j                  j        'S rV   r7  r:  s     r*   r;  zcomplement.<locals>.<dictcomp>  s)    RRR1bhrx((,RRRr+   rR  r  Tr   )r   _df)r   rr   r   rs   rt   r  rT  r=  r[  r   )boundsr   rL   ) r   r   rc   rO   r:   r   rb  r  rc  rd  r   r>   r   re  ra   r   sortedr2   rl   r   r   r   complement_intervalsr?   r   r@   r   r   r^   r_   r   r`   )r    r<  rS  r"   rZ  r#   r$   r%   rk  rl  rm  new_intervalsr   r   complements	group_keyregion_intervalregion_chromregion_start
region_endcomplement_groupr   r   complement_starts_groupcomplement_ends_groups                            r*   r   r   m  s   P -1L&(((dJBBBR%%%RRc"R&--//:P6Q6QRRR/8/@)+++iMCc)}Cc?  fT#sCoB|<<==f>>  
  M "	Z"_j2o}=
dff  OROR=

      
B

!#    

=))0IGM23344JK #- #-	!+gm&<	&IJ1@"b"1N1UVW1X.lJI%%.3355<<&6  =     r|,<==>>>!),36-(
 'RL&&rx00RL&&rx00 *-
 
 
	
#! 	W4:1=|LLfl   '%9
 <(899+,,,,)K((44$4??Kr+   c           
      J   |t                      n|\  }}}	t          j        | |          st          d          |                                 }
||
                    |||	gd           nw|t                      n|\  }}}t          j        |||||g                              t          t          |||g|||	g                              }|:t          |
dgd	          rt          d
          d}t          |
|||||          }
nzt          |
|gd	          st          d          t          j        |
t          j        |
|                                                             |||          st          d          t          j        ||         j        d          }|
|                             ||i          |
|<   |
                    ||||	gd           |
| j                                     | j                  }
|r|
                    dd           |
S )a  
    Sorts a bedframe 'df'.

    If 'view_df' is not provided, sorts by ``cols`` (e.g. "chrom", "start", "end").
    If 'view_df' is provided and 'df_view_col' is not provided, uses
    :func:`bioframe.ops.assign_view` with ``df_view_col='view_region'``
    to assign intervals to the view regions with the largest overlap and then
    sorts.
    If 'view_df' and 'df_view_col' are both provided, checks if the latter are
    cataloged in 'view_name_col', and then sorts.

    df : pandas.DataFrame
        Valid bedframe.

    view_df : pandas.DataFrame | dict-like
        Valid input to make a viewframe. When it is dict-like
        :func:'bioframe.make_viewframe' will be used to convert
        to viewframe. If view_df is not provided df is sorted by chrom and start.

    reset_index : bool
        Default True.

    df_view_col: None | str
        Column from 'df' used to associate intervals with view regions.
        The associated region in 'view_df' is then used for sorting.
        If None, :func:'bioframe.assign_view' will be used to assign view regions.
        Default None.

    view_name_col: str
        Column from view_df with names of regions.
        Default `name`.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    cols_view : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in the view. The default values are 'chrom',
        'start', 'end'.

    Returns
    -------
    out_df : sorted bedframe

    Notes
    -------
        df_view_col is currently returned as an ordered categorical

    Nr  z!not a valid bedframe, cannot sortT)r   rR  r  rT  rU  rW  )rY  rS  r"   rZ  z9column 'df_view_col' not in input df, cannot sort by view)rY  rS  z=intervals in df not cataloged in view_df, cannot sort by view)
categoriesorderedr   )r   r   r9   r   r>   r   r   rb  r  rc  rd  r   r   re  r   r   r   CategoricalDtyper:   r?   r   r;   r`   )r    r<  r`   rY  rS  r"   rZ  rv   rw   rx   r   rk  rl  rm  view_cats                  r*   r   r     s   v 04|)+++MCcbt,,, ><===WWYYFCc?D9999 4=3D-///)S#-=S#
 
 

&c3S/Cc?CCDD&
E
E 	 vtLLL R !PQQQ'K '+  FF #6K=NNN  O   &{ 3 < < > >???@'+	    !S   &}-4d
 
 
 %[188+x9PQQ{Kc37FFF BJ&&ry11F 44d333Mr+   rT  c                    |t                      n|\  }}}	|                                 } |                     dd           t          j        | d|           t          j        |||          }t          | |ddddd||		  	        }
|
d
|	z            |
d
|z            z
  |
d<   |
                    dd          	                    dd                              d          }|
                    |dz   |id           |r@|j        t          j        |                              d          j        dk    ddf         }|                    dd           g t!          | j                  |}||         S )a  
    Associates genomic intervals in bedframe ``df`` with regions in viewframe
    ``view_df``, based on their largest overlap.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        ViewFrame specifying region start and ends for assignment. Attempts to
        convert dictionary and pd.Series formats to viewFrames.

    drop_unassigned : bool
        If True, drop intervals in df that do not overlap a region in the view.
        Default False.

    df_view_col : str
        The column of ``df`` used to specify view regions.
        The associated region in view_df is then used for trimming.
        If no view_df is provided, uses the chrom column, ``df[cols[0]]``.
        Default "view_region".

    view_name_col : str
        Column of ``view_df`` with region names. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    cols_view : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in the view. The default values are 'chrom',
        'start', 'end'.

    Returns
    -------
    out_df : dataframe with an associated view region for each interval in
    ``out_df[view_name_col]``.

    Notes
    -------
    Resets index.

    NTr   r5   rR  r7   r\  F)rr   r   r   r   r   rs   rt   r  overlap_length)	ascendingr0   first)keepr]  r=  r   r   r   )r   r>   r`   r   r9   r   rb  r   r   drop_duplicatesr  r   r   r   r   r:   r   r   )r    r<  rX  rY  rS  r"   rZ  rv   rw   rx   overlap_viewr   return_colss                r*   r   r   d  s   l 04|)+++MCc	BNN4dN+++
r48888)}9  G 

 
 
L 	Z#%&j36F)GG !"
 	  !1U CC	w	/	/	W		  MM=72K@$MOOO IRWV__00a088?1DaaaGH
t$///2D$$2k2K+r+   rU   )NNr3   N)r7   NNN)
r7   TFFr   NNNNT)r   NNTTT)r   NN)r   TNN)	Nr   FFFNNNN)Nr   FFFNNTFTFr   NN)Fr   NN)NNN)r   TNNN)NNrO  FNN)NrO  NN)NTNrO  NN)FrT  rO  NN)4numpyr   pandasr   corer   r   r   
core.specsr   r   core.stringopsr	   __all__r   r   r   r
   r   r   r_   int8	Int8Dtypeint16
Int16Dtypeint32
Int32Dtyper@   r   uint8
UInt8Dtypeuint16UInt16Dtypeuint32UInt32Dtypeuint64UInt64Dtyper   r   r   r   r   r   r#  r   r   r   r   r   r   r   r   rV   r+   r*   <module>r     s            . . . . . . . . . . > > > > > > > > ( ( ( ( ( (  *% % % %P8 8 8 8.3 3 3 301 1 1 1DK K K K\P P P Ph BHRW|r|~~BHRXBHRXBHRXBHRXBHRY))BHRY))BHRY))	 0 0 0 	

E E E ET 	!U U U Up@ @ @ @L 

J J J J^ 	

R R R Rn 	

] ] ] ]F 

W W W Wt# # # #R 

C C C CP 	t& t& t& t&n   H 	r r r rp 	^ ^ ^ ^ ^ ^r+   