
    DUf:                         d dl Zd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
 g dZ	 	 dd	Z	 ddZddZddZ	 	 	 	 	 ddZddZ	 	 	 	 	 ddZ	 	 	 	 	 	 ddZdS )    N   )ops   )construction)_get_default_colnames_verify_column_dtypes_verify_columns)is_bedframeis_catalogedis_overlappingis_viewframeis_containedis_covering	is_tiling	is_sortedFc                 p   |t                      n|\  }}}t          | |||gd          s|rt          d          dS t          | |||gd          s|rt          d          dS t	          j        | |||g                   }|                    d	           |                    d	          z                                   r|rt          d
          dS | |         | |         z
  dk                                     r7|r3t          dt          | |         | |         z
  dk                d          dS dS )a  
    Checks that required bedframe properties are satisfied for dataframe `df`.

    This includes:

    - chrom, start, end columns
    - columns have valid dtypes
    - for each interval, if any of chrom, start, end are null, then all are
        null
    - all starts < ends.

    Parameters
    ----------
    df : pandas.DataFrame

    raise_errors : bool, optional [default: False]
        If True, raises errors instead of returning a boolean False for invalid
        properties.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_bedframe:bool

    Notes
    -----
    Valid dtypes for chrom are object, string, or categorical.
    Valid dtypes for start and end are int/Int64Dtype.
    NTreturn_as_boolz&Invalid bedFrame: Invalid column namesF)colsr   z'Invalid bedFrame: Invalid column dtypesr   )axiszcInvalid bedFrame: Invalid null values (if any of chrom, start, end are null, then all must be null)r   z)Invalid bedframe: starts exceed ends for z
 intervals)
r   r	   	TypeErrorr   pdisnullanyall
ValueErrorsum)dfraise_errorsr   ck1sk1ek1nan_intervalss          Q/var/www/html/software/conda/lib/python3.11/site-packages/bioframe/core/checks.pyr
   r
      s   L 04|)+++MCc2S#tDDD  	FDEEEu 3S/$OOO  	GEFFFuIb#sC122M!$$
$}'8'8a'8'@'@
@AFFHH  	P   u	C2c7	a$$&&  	<3"S')Q.//< < <   u4    view_regionnamec                 6   t          | |gd          s|rt          d| d          dS t          ||gd          s|rt          d| d          dS t          | |                                                                         j                                      t          ||         j                            s[|rWt          | |         j                                      t          ||         j                            }t          d|           dS dS )	a}  
    Tests if all region names in `df[df_view_col]` are present in
    `view_df[view_name_col]`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col: str
        Name of column from df that indicates region in view.

    view_name_col: str
        Name of column from view that specifies  region name.

    Returns
    -------
    is_cataloged:bool

    Notes
    -----
    Does not check if names in `view_df[view_name_col]` are unique.

    Tr   zCould not find `z` column in dfFz Could not find                 `z#`                 column in view_dfzIThe following regions in df[df_view_col] not in view_df[view_name_col]: 
)r	   r   setcopydropnavaluesissubset
difference)r   view_dfr   df_view_colview_name_colmissing_regionss         r$   r   r   Z   s\   @ 2}TBBB  	MKKKKLLLu7]ODIII  	$ ## # # $ $ $ ur+##%%,,..566??GM")**    	!"[/"899DDGM*122 O ?-<? ?   u4r%   c                    ddl m} |t                      n|\  }}} || |          }t          j        | |         | |         z
  j                  }t          j        ||         ||         z
  j                  }||k    rdS dS )a  
    Tests if any genomic intervals in a bioframe `df` overlap.

    Also see :func:`bioframe.ops.merge()`.

    Parameters
    ----------
    df : pandas.DataFrame

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_overlapping:bool

    r   )mergeNr   TF)r   r4   r   npr   r,   )	r   r   r4   r    r!   r"   	df_mergedtotal_interval_lentotal_interval_len_mergeds	            r$   r   r      s    ( /3|)+++MCcbt$$$IC2c7!2 :;; "	#3(G'O P P555tur%   c                 2   |t                      n|\  }}}t          | ||||gd          s|rt          d          dS t          | |          s|rt	          d          dS t          j        |           j                                        r|rt	          d          dS t          t          | |                             t          | |         j                  k     r|rt	          d	          dS t          | |          r|rt	          d
          dS dS )a  
    Checks that `region_df` is a valid viewFrame.

    This includes:

    - it satisfies requirements for a bedframe, including columns for
      ('chrom', 'start', 'end')
    - it has an additional column, view_name_col, with default 'name'
    - it does not contain null values
    - entries in the view_name_col are unique.
    - intervals are non-overlapping

    Parameters
    ----------

    region_df : pandas.DataFrame
        Dataframe of genomic intervals to be tested.

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    view_name_col : str
        Specifies column name of the view regions. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_viewframe:bool

    NTr   z"Invalid view: invalid column namesFr5   zInvalid view: not a bedframez Invalid view: cannot contain NAszPInvalid view: entries in                 region_df[view_name_col] must be uniquez-Invalid view: entries must be non-overlapping)r   r	   r   r
   r   r   isnar,   r   lenr)   r   )	region_dfr   r1   r   r    r!   r"   s          r$   r   r      sb   J 04|)+++MCcCc=1$     	B@AAAuyt,,,  	=;<<<u	wy $$&&  	A?@@@u
3y'(())C	-0H0O,P,PPP 	9   uid+++  	NLMMMu4r%   c                    ddl m} |t                      n|\  }}	}
|t                      n|\  }}}|	 t          j        | |||          }||dz                                                                            dk    sJ ||dz                                                                            dk    sJ ||
         ||dz            k                                    sJ ||	         ||dz            k                                    sJ n"# t          $ r |rt          d          Y dS w xY wd	S t          | |||
          s|rt          d          dS  || |||||          }t          j        | |	         j        ||	         j        k              }t          j        | |
         j        ||
         j        k              }|s|r|rt          d          dS d	S )a6  
    Tests if all genomic intervals in a bioframe `df` are cataloged and do not
    extend beyond their associated region in the view `view_df`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        Valid viewframe.

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col:
        Column from df used to associate interviews with view regions.
        Default `view_region`.

    view_name_col:
        Column from view_df with view region names. Default `name`.

    cols: (str, str, str)
        Column names for chrom, start, end in df.
    cols_view: (str, str, str)
        Column names for chrom, start, end in view_df.

    Returns
    -------
    is_contained:bool

    r   )trimN)cols1cols2_r   zdf not contained in view_dfFT)r0   r1   zdf not cataloged in view_df)r/   r0   r1   r   	cols_view)r   r?   r   overlapr;   r   r   AssertionErrorr   r   r6   r   r,   )r   r/   r   r0   r1   r   rC   r?   r    r!   r"   ck2sk2ek2df_view_assigneddf_trimis_start_trimmedis_end_trimmeds                     r$   r   r     sN   R /3|)+++MCc/8/@)+++iMCc	"{2wd)TTT$S3Y/4466;;==BBBB$S3Y/4466;;==BBBB$S)-=cCi-HHMMOOOOO %S)-=cCi-HHMMOOOOOO 	 	 	 $%BCCCuu		
 t
GM     	<:;;;ud
#  G vbgn0CCDDVBsGNgcl.AABBN >  	<:;;;uts   CD D*)D*c                 @    ddl m}  || ||||          j        rdS dS )a  
    Tests if a view `view_df` is covered by the set of genomic intervals in
    the bedframe `df`.

    This test is true if ``complement(df,view_df)`` is empty. Also note this
    test ignores regions assigned to intervals in `df` since regions are
    re-assigned in :func:`bioframe.ops.complement`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        Valid viewFrame.

    view_name_col:
        Column from view_df with view region names. Default `name`.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df, provided separately for
        each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_covering:bool

    r   )
complement)r/   r1   r   rC   TF)r   rN   empty)r   r/   r1   r   rC   rN   s         r$   r   r   [  sP    F !     z
#   	 tur%   c                    t          j        |||          }t          | |          r|rt          d          dS t	          | ||||          s|rt          d          dS t          | |||||          s|rt          d          dS d	S )
a  
    Tests if a view `view_df` is tiled by the set of genomic intervals in the
    bedframe `df`.

    This is true if:

    - df is not overlapping
    - df is covering view_df
    - df is contained in view_df

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        valid viewFrame

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col: str
        Name of column from df that indicates region in view.

    view_name_col: str
        Name of column from view that specifies unique region name.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.
    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df, provided
        separately for each set. The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_tiling:bool

    )r1   r   r5   overlapsF)r1   r   rC   znot covered)r0   r1   r   rC   znot containedT)r   make_viewframer   r   r   r   )r   r/   r   r0   r1   r   rC   s          r$   r   r     s    h )}9  G bt$$$  	)Z(((u
G=ty     	,]+++u
#   
  	._---u4r%   Tc           	          ddl m}  ||                                 ||||||          }|                     |          rdS dS )a>  
    Tests if a bedframe is changed by sorting.

    Also see :func:`bioframe.ops.sort_bedframe`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame | dict-like
        Optional view to pass to ``sort_bedframe``.
        When it is dict-like :func:'bioframe.make_viewframe' will
        be used to convert to viewframe. If view_df is not provided
        df is assumed to be sorted by chrom and start.

    reset_index : bool
        Optional argument to pass to ``sort_bedframe``.

    df_view_col: None | str
        Name of column from df that indicates region in view.
        If None, :func:'bioframe.assign_view' will be used to assign view
        regions. Default None.

    view_name_col: str
        Name of column from view that specifies unique region name.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals, provided separately for each set. The default
        values are 'chrom', 'start', 'end'.

    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df, provided separately for each set.
        The default
        values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_sorted : bool

    r   )sort_bedframe)r/   reset_indexr0   r1   r   rC   TF)r   rT   r*   equals)	r   r/   rU   r0   r1   r   rC   rT   	df_sorteds	            r$   r   r     sj    f $#####
		#  I 
yy tur%   )FN)Fr&   r'   )N)Fr'   N)FNr'   NN)r'   NN)Fr&   r'   NN)NTNr'   NN)numpyr6   pandasr    r   r   specsr   r   r	   __all__r
   r   r   r   r   r   r   r    r%   r$   <module>r^      s|                       P P P P P P P P P P	 	 	 	C C C CN OU9 9 9 9x       FE E E EV 	W W W Wt. . . .h 	M M M Md 	B B B B B Br%   