
    >ie+                    R   d dl mZ d dlZd dlZd dlmZ d dlmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZmZ d dlmZ 	 	 	 	 	 	 	 	 	 ddZd Z  ej!        d          eddddddddde	j"        ddfd                        Z"	 ddZ#d Z$d Z%dS )    )annotationsN)partial)zip_longest)
open_files)compute)
read_bytes)flatten)PANDAS_GE_200PANDAS_VERSION)dataframe_creation_dispatch)from_delayed)insert_meta_param_description	make_meta)delayedrecordsTutf-8strictc           	     N   ||dk    }|dk    r|rt          d          |d<   |o|dk    d<   t          |df|||
| j        |d|pi }fdt          ||                                           D             }|r)|	t                      }	t          t          |i |	          S |S )	a  Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    compression : string or None
        String like 'gzip' or 'xz'.
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions.
    Nr   ;Line-delimited JSON is only available withorient="records".orientlineswt)encodingerrorsname_functionnumcompressionc                T    g | ]$\  }} t          t                    ||          %S  )r   write_json_partition).0outfiledkwargss      6lib/python3.11/site-packages/dask/dataframe/io/json.py
<listcomp>zto_json.<locals>.<listcomp>[   sD       GQ 	&$%%a&99      )
ValueErrorr   npartitionszip
to_delayeddictlistdask_compute)dfurl_pathr   r   storage_optionsr   r   r   r   compute_kwargsr   r$   outfilespartss              `  r%   to_jsonr5      s   h })#uL
 
 	
 F83) 3F7O	 #N	 	  b	 	H   h88  E  !!VVNL%:>::;;;r'   c                    |5 } | j         |fi | d d d            n# 1 swxY w Y   t          j                            |j                  S N)r5   ospathnormpath)r/   openfiler$   fs       r%   r    r    g   s    	  Q
1                             7HM***s   ""pandasi   inferFc           
       	
 dk    dk    rrt          d          |rdk    sst          d          |pi }du rdd t          
t                    rGt          s%t          dt          t                     d	          t          t          j        

          
|rt          | df|||d|}ra|\  }}} |d                   }t          j	        fd|D                       t          fdt          ||          D                       }n|\  }}d}d}dt          |          }	t          |
|          	t          	          	
	fdt          ||          D             }nFt          | df|d|}t          j	        fd|D                       
fd|D             }t!          |	          S )a  Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : callable or str, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
        If a string is specified, this value will be passed under the ``engine``
        key-word argument to ``pd.read_json`` (only supported for pandas>=2.0).
    include_path_column : bool or str, optional
        Include a column with the file path where each row in the dataframe
        originated. If ``True``, a new column is added to the dataframe called
        ``path``. If ``str``, sets new column name. Default is ``False``.
    path_converter : function or None, optional
        A function that takes one argument and returns a string. Used to convert
        paths in the ``path`` column, for instance, to strip a common prefix from
        all the paths.
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >> dd.read_json('data/file*.csv', blocksize=2**28)
    Nr   r   zSJSON file chunking only allowed for JSON-linesinput (orient='records', lines=True).Tr9   c                    | S r7   r   )xs    r%   <lambda>zread_json.<locals>.<lambda>   s    1 r'   zYPandas>=2.0 is required to pass a string to the `engine` argument of `read_json` (pandas=z is currently installed).)engine   
)	blocksizesampler   include_pathr   c              3  .   K   | ]} |          V  d S r7   r   )r!   ppath_converters     r%   	<genexpr>zread_json.<locals>.<genexpr>   s-      ,N,N1^^A->->,N,N,N,N,N,Nr'   c              3  V   K   | ]#\  }} |          gt          |          z  V  $d S r7   )len)r!   rI   chunkrJ   s      r%   rK   zread_json.<locals>.<genexpr>   sP       ! !5=Q""#c%jj0! ! ! ! ! !r'   r7   c                b    g | ]+\  }} t          t                    ||	 	  	        ,S )meta)r   read_json_chunk)
r!   rN   r9   r   rC   r   include_path_columnr$   rQ   
path_dtypes
      r%   r&   zread_json.<locals>.<listcomp>  sc     
 
 
 t %GO$$#
 
 

 
 
r'   rt)r   r   r   c              3  8   K   | ]} |j                   V  d S r7   )r9   )r!   r<   rJ   s     r%   rK   zread_json.<locals>.<genexpr>  s/      (O(OA)?)?(O(O(O(O(O(Or'   c                t    g | ]4} t          t                    | |j                            5S r   )r   read_json_filer9   )	r!   r<   rC   rS   r$   r   r   rJ   rT   s	     r%   r&   zread_json.<locals>.<listcomp>   sc     
 
 
  $GN###qv&&	 	
 
 
r'   rP   )r(   
isinstancestrr
   r   r   pd	read_jsonr   CategoricalDtyper	   r*   rR   r   r   r   r   )r0   r   r   r1   rE   rF   r   r   r   rQ   rC   rS   rJ   r$   b_outfirstchunkspaths
first_path
flat_pathsflat_chunksr4   filesrT   s    ``   `` `````         @r%   r\   r\   m   s?   v })#uL
 
 	
  
f	)))4
 
 	
 &+Od""$$ &# 6 	J~..J J J  
 f555 H

  #,
 
 
 
  	#( E65'a11J,,N,N,N,N,N,N,NNNJ  ! ! ! !ADUFASAS! ! !  JJ "ME6J JJfoo<"#	 	D 
 
 
 
 
 
 
 
 
 
  +;
CC
 
 
 
 #
 
 
 
 ((O(O(O(O(O(O(OOO

 
 
 
 
 
 
 
 
 
 
 
 
 D))))r'   c	                    t          j        |                     ||                    }	|	                    d            ||	fddd|}
|	|
j        r|S |rt          |
|||          }
|
S )Nr   r   Tr   r   )ioStringIOdecodeseekemptyadd_path_column)rN   r   r   rC   column_namer9   rT   r$   rQ   sr/   s              r%   rR   rR   1  s     	ELL62233AFF1III		:)4	:	:6	:	:BBH @RdJ??Ir'   c                z    | 5 } ||f||d|}	d d d            n# 1 swxY w Y   |rt          |	|||          }	|	S )Nrg   )rm   )
r<   r   r   rC   rn   r9   rT   r$   	open_filer/   s
             r%   rX   rX   @  s    	
 EiVIDfEDDVDDE E E E E E E E E E E E E E E @RdJ??Is     c                    || j         v rt          d| d           | j        di |t          j        |gt          |           z  |          iS )Nz(Files already contain the column name: 'z^', so the path column cannot use this name. Please set `include_path_column` to a unique name.)dtyper   )columnsr(   assignr[   SeriesrM   )r/   rn   r9   rs   s       r%   rm   rm   H  sr    bj  {   
 
 	

 29OORYvB/?u%M%M%MNOOOr'   )	r   NNTr   r   NNNr7   )&
__future__r   rh   r8   	functoolsr   	itertoolsr   r=   r[   fsspec.corer   	dask.baser   r.   
dask.bytesr   	dask.corer	   dask.dataframe._compatr
   r   dask.dataframe.backendsr   dask.dataframe.io.ior   dask.dataframe.utilsr   r   dask.delayedr   r5   r    register_inplacer\   rR   rX   rm   r   r'   r%   <module>r      s   " " " " " " 				 				       ! ! ! ! ! !     " " " " " " - - - - - - ! ! ! ! ! !       @ @ @ @ @ @ @ @ ? ? ? ? ? ? - - - - - - I I I I I I I I             
O O O Od+ + + .-h77 
	<* * *  87*F RV     P P P P Pr'   