
    tfk+                    R   d dl mZ d dlZd dlZd dlmZ d dlmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZmZ d dlmZ 	 	 	 	 	 	 	 	 	 ddZd Z  ejB                  d      eddddddddde	jD                  ddfd              Z"	 ddZ#d Z$d Z%y)    )annotationsN)partial)zip_longest)
open_files)compute)
read_bytes)flatten)PANDAS_GE_200PANDAS_VERSION)dataframe_creation_dispatch)from_delayed)insert_meta_param_description	make_meta)delayedrecordsutf-8strictc           	     n   ||dk(  }|dk7  r|rt        d      ||d<   |xr |dk(  |d<   t        |df|||
| j                  |d|xs i }t        || j	                               D cg c]  \  }} t        t              |||       }}}|r |	
t               }	t        t        |i |	      S |S c c}}w )a  Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    compression : string or None
        String like 'gzip' or 'xz'.
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions.
    r   <Line-delimited JSON is only available with orient="records".orientlineswt)encodingerrorsname_functionnumcompression)

ValueErrorr   npartitionszip
to_delayedr   write_json_partitiondictlistdask_compute)dfurl_pathr   r   storage_optionsr   r   r   r   compute_kwargsr   kwargsoutfilesoutfiledpartss                   `/var/www/html/software/conda/envs/higlass/lib/python3.12/site-packages/dask/dataframe/io/json.pyto_jsonr0      s    h })#uWXXF83) 3F7O	 #NN	  b	H h8GQ 	&$%a&9E  !!VNL%:>:;;s   )!B1c                    |5 } | j                   |fi | d d d        t        j                  j                  |j                        S # 1 sw Y   2xY wN)r0   ospathnormpath)r&   openfiler*   fs       r/   r"   r"   e   sH    	  Q

1 77HMM**   s   AApandasi   inferFc                   ||dk(  }|dk7  r|rt        d      |r|dk7  s|st        d      |xs i }|du rd}d t        |
t              r=t        st        dt        t               d	      t        t        j                  |

      }
|rt        | df||||d|}|rN|\  }}} |d         }t        j                  fd|D              }t        fdt        ||      D              }n|\  }}d}d}d}t        |      }|	t        ||||
||||      }	t        |	      }	t        ||      D cg c]#  \  }} t        t              ||||
|||||		      % }}}nht!        | df|||d|}t        j                  fd|D              }|D cg c].  } t        t"              ||||
| |j$                        ||      0 }}t'        ||	      S c c}}w c c}w )a  Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : callable or str, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
        If a string is specified, this value will be passed under the ``engine``
        key-word argument to ``pd.read_json`` (only supported for pandas>=2.0).
    include_path_column : bool or str, optional
        Include a column with the file path where each row in the dataframe
        originated. If ``True``, a new column is added to the dataframe called
        ``path``. If ``str``, sets new column name. Default is ``False``.
    path_converter : function or None, optional
        A function that takes one argument and returns a string. Used to convert
        paths in the ``path`` column, for instance, to strip a common prefix from
        all the paths.
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >> dd.read_json('data/file*.csv', blocksize=2**28)
    Nr   r   zSJSON file chunking only allowed for JSON-linesinput (orient='records', lines=True).Tr4   c                    | S r2    )xs    r/   <lambda>zread_json.<locals>.<lambda>   s    1     zYPandas>=2.0 is required to pass a string to the `engine` argument of `read_json` (pandas=z is currently installed).)engine   
)	blocksizesampler   include_pathr   c              3  .   K   | ]  } |        y wr2   r<   ).0ppath_converters     r/   	<genexpr>zread_json.<locals>.<genexpr>   s     ,N1^A->,Ns   c              3  N   K   | ]  \  }} |      gt        |      z    y wr2   )len)rF   rG   chunkrH   s      r/   rI   zread_json.<locals>.<genexpr>   s+      !5=Q"#c%j0!s   "%r2   )metart)r   r   r   c              3  B   K   | ]  } |j                           y wr2   )r4   )rF   r7   rH   s     r/   rI   zread_json.<locals>.<genexpr>  s     (OA)?(Os   )r   
isinstancestrr
   r   r   pd	read_jsonr   CategoricalDtyper	   r    read_json_chunkr   r   r   r   read_json_filer4   r   )r'   r   r   r(   rB   rC   r   r   r   rM   r@   include_path_columnrH   r*   b_outfirstchunkspaths
first_path
path_dtype
flat_pathsflat_chunksrL   r4   r.   filesr7   s               `              r/   rS   rS   k   s   v })#uWXXf	)4
 	
 &+Od"$$ &#~.//HJ 
 f5
  #,
 
 #( E65'a1J,,,N,NNJ  !ADUFAS! J "ME6J JJfo<"#	D   +;
C
 t %GO$#

 
 
 #
 
 (((O(OO
 
  $GN##qvv&	
 
 D))M
0
s   5(G3Gc	                    t        j                  | j                  ||            }	|	j                  d        ||	fddd|}
||
j                  r|S |rt        |
|||      }
|
S )Nr   r   Tr   r   )ioStringIOdecodeseekemptyadd_path_column)rL   r   r   r@   column_namer4   r]   r*   rM   sr&   s              r/   rU   rU   -  sg     	ELL623AFF1I		:)4	:6	:BBHHRdJ?Ir?   c                l    | 5 } ||f||d|}	d d d        |rt        	|||      }		S # 1 sw Y   xY w)Nrb   )rh   )
r7   r   r   r@   ri   r4   r]   r*   	open_filer&   s
             r/   rV   rV   <  sN    	
 EiIDfEDVDERdJ?I	E Es   *3c                    || j                   v rt        d| d       | j                  di |t        j                  |gt        |       z  |      iS )Nz(Files already contain the column name: 'z^', so the path column cannot use this name. Please set `include_path_column` to a unique name.)dtyper<   )columnsr   assignrR   SeriesrK   )r&   ri   r4   rn   s       r/   rh   rh   D  s^    bjj 6{m D 
 	

 299ORYYvB/?u%MNOOr?   )	r   NNTr   r   NNNr2   )&
__future__r   rc   r3   	functoolsr   	itertoolsr   r8   rR   fsspec.corer   	dask.baser   r%   
dask.bytesr   	dask.corer	   dask.dataframe._compatr
   r   dask.dataframe.backendsr   dask.dataframe.io.ior   dask.dataframe.utilsr   r   dask.delayedr   r0   r"   register_inplacerS   rU   rV   rh   r<   r?   r/   <module>r      s    " 	 	  !  " - !  @ ? - I   
M`+ .--h7 
	<<}*  8}*B RVPr?   