
    ɯei4S                     <   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z
d dl	mZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d d	lmZmZ d d
lmZmZ d dlm Z m!Z!m"Z"m#Z$m%Z%m&Z&m'Z' e jP                  dk  rd dlm)Z) nd dl*m)Z) d dl+m,Z,  e,e-      Z.dZ/ G d d      Z0y)    N)reduce)CallableDictListOptionalUnion)Column)SampleBy)build_expr_from_python_val+build_expr_from_snowpark_column_or_col_namewith_src_positionDATAFRAME_AST_PARAMETER)SnowparkClientExceptionMessages)ResourceUsageCollectoradd_api_calladjust_api_subcalls)ColumnOrNameLiteralType)	publicapiwarning)_to_col_if_strapprox_percentile_accumulateapprox_percentile_estimatecorrcountcount_distinct
covar_samp)   	   )Iterable)	getLoggeri  c                      e Zd ZdZ	 	 	 	 ddZeddddeeee   f   d	ee	   d
e
eeef      dedeee	   eee	      f   f
d       Zeddddededed
e
eeef      de
e	   f
d       Zeddddededed
e
eeef      de
e	   f
d       Zeddddededed
e
eeef      ddf
d       Zdedeee	f   deddfdZ	 ddedeee	f   deddfdZe	 	 ddedeee	f   de
e   deddf
d       ZeZeZy)DataFrameStatFunctionszProvides computed statistical functions for DataFrames.
    To access an object of this class, use :attr:`DataFrame.stat`.
    snowflake.snowpark.DataFramereturnNc                     || _         y )N)
_dataframe)self	dataframes     m/var/www/html/glpi_dashboard/venv/lib/python3.12/site-packages/snowflake/snowpark/dataframe_stat_functions.py__init__zDataFrameStatFunctions.__init__9   s     $    T)statement_params	_emit_astcol
percentiler-   r.   c          
      "   |r|sg S i }|r| j                   j                  j                  j                         }t	        |j
                  j                  |      }| j                   j                  |j                         t        |t              rWt        |t              sGd|j                  _        |D ]0  }t        |j                  j                  j!                         |       2 n?d|j                  _        t        |j                  j                  j!                         |       t        |t              st#        dt%        |       d      |j&                  j)                  |       |@|j+                         D ]-  \  }	}
|j,                  j!                         }|	|_        |
|_        / | j                   j                  j                  j3                  |       | j                   j                  j                  j5                  |      \  }|t6        <   d}t        |t8        t        f      r| j                   j;                  t=        |      j?                  |      d      j;                  |D cg c]  }tA        ||       c}d      }tC        |dd	        |jD                  dd
|i|}tG        |d         S t        |tF        tH        f      rtK        |      D cg c]$  \  }}t=        |      j?                  | d|       & }}}tM        tO        |            D cg c]  }|D ]  }tA        | d| |        }}}tO        |      tO        |      z  }| j                   j;                  |d      j;                  |d      }tC        |dd	        |jD                  dd
|i|}tM        tO        |            D cg c]!  }|d   ||z  |dz   |z   D cg c]  }| c}# c}}S tQ        d      c c}w c c}}w c c}}w c c}w c c}}w )at  For a specified numeric column and a list of desired quantiles, returns an approximate value for the column at each of the desired quantiles.
        This function uses the t-Digest algorithm.

        Examples::

            >>> df = session.create_dataframe([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], schema=["a"])
            >>> df.stat.approx_quantile("a", [0, 0.1, 0.4, 0.6, 1])  # doctest: +SKIP

            >>> df2 = session.create_dataframe([[0.1, 0.5], [0.2, 0.6], [0.3, 0.7]], schema=["a", "b"])
            >>> df2.stat.approx_quantile(["a", "b"], [0, 0.1, 0.6])  # doctest: +SKIP

        Args:
            col: The name of the numeric column.
            percentile: A list of float values greater than or equal to 0.0 and less than 1.0.
            statement_params: Dictionary of statement level parameters to be set while executing this action.

        Returns:
             A list of approximate percentile values if ``col`` is a single column name, or a matrix
             with the dimensions ``(len(col) * len(percentile)`` containing the
             approximate percentile values if ``col`` is a list of column names.
        FTzpercentile is of type z, but expected Iterable.tr.   z&DataFrameStatFunctions.approx_quantile   len_subcallsr-   r   _   z@'col' must be a column name, a column object, or a list of them. ))r'   _session
_ast_batchbindr   exprdataframe_stat_approx_quantile_set_ast_refdf
isinstancer    strcolsvariadicr   argsadd
ValueErrortyper0   extenditemsr-   _1_2evalflushr   r	   selectr   as_r   r   _internal_collect_with_taglisttuple	enumeraterangelen	TypeError)r(   r/   r0   r-   r.   kwargsstmtr=   ckvr2   r7   temp_col_namepr@   resicol_iaccumate_colsoutput_colspercentile_lenjxs                           r*   approx_quantilez&DataFrameStatFunctions.approx_quantile?   s   > I??++66;;=D$TYY%M%MtTDOO((1#x(C1E%*		" YA?		@R@R@TVWXY &*		";DIINN<N<N<PRUV j(3 ,T*-=,>>VW  OO"":.+,224 DAq--113AADAD
 OO$$//44T: ((3399$?./ cFC=)'',S155mDPU ( fGQR!+M1=R     <1 0"// !15;C A<dE]+ !*#Au -U377=/1#8NOM  s=12#  +m_AaS+A1EEK 
 !-]1CCN'''GNNu O B  <1 0"// !15;C
 s=12  F1~#5Q.8PQRqR 
 R G S  Ss*   ,O5)O: P  P	P!PP)r.   r-   col1col2c                N   i }|rI| j                   j                  j                  j                         }t	        |j
                  j                  |      }| j                   j                  |j                         t        |j                  |       t        |j                  |       |@|j                         D ]-  \  }}	|j                  j                         }
||
_        |	|
_        / | j                   j                  j                  j#                  |       | j                   j                  j                  j%                  |      \  }|t&        <   | j                   j)                  t+        ||      d      }t-        |dd        |j.                  d	d|i|}|d   |d   d   S dS )
a0  Calculates the correlation coefficient for non-null pairs in two numeric columns.

        Example::

            >>> df = session.create_dataframe([[0.1, 0.5], [0.2, 0.6], [0.3, 0.7]], schema=["a", "b"])
            >>> df.stat.corr("a", "b")
            0.9999999999999991

        Args:
            col1: The name of the first numeric column to use.
            col2: The name of the second numeric column to use.
            statement_params: Dictionary of statement level parameters to be set while executing this action.

        Return:
            The correlation of the two numeric columns.
            If there is not enough data to generate the correlation, the method returns ``None``.
            statement_params: Dictionary of statement level parameters to be set while executing this action.
        NFr3   DataFrameStatFunctions.corrr8   r5   r-   r   r9   )r'   r:   r;   r<   r   r=   dataframe_stat_corrr?   r@   r   rh   ri   rJ   r-   rF   rK   rL   rM   rN   r   rO   	corr_funcr   rQ   r(   rh   ri   r.   r-   rX   rY   r=   r[   r\   r2   r7   r@   r_   s                 r*   r   zDataFrameStatFunctions.corr   sn   8 ??++66;;=D$TYY%B%BDIDOO((17		4H7		4H+,224 DAq--113AADAD
 OO$$//44T: ((3399$?./ __##IdD$9U#KB =AN+b++X=MXQWXF.s1vay8D8r,   c                N   i }|rI| j                   j                  j                  j                         }t	        |j
                  j                  |      }| j                   j                  |j                         t        |j                  |       t        |j                  |       |@|j                         D ]-  \  }}	|j                  j                         }
||
_        |	|
_        / | j                   j                  j                  j#                  |       | j                   j                  j                  j%                  |      \  }|t&        <   | j                   j)                  t+        ||      d      }t-        |dd        |j.                  d	d|i|}|d   |d   d   S dS )
a  Calculates the sample covariance for non-null pairs in two numeric columns.

        Example::

           >>> df = session.create_dataframe([[0.1, 0.5], [0.2, 0.6], [0.3, 0.7]], schema=["a", "b"])
           >>> df.stat.cov("a", "b")
           0.010000000000000037

        Args:
            col1: The name of the first numeric column to use.
            col2: The name of the second numeric column to use.
            statement_params: Dictionary of statement level parameters to be set while executing this action.

        Return:
            The sample covariance of the two numeric columns.
            If there is not enough data to generate the covariance, the method returns None.
        NFr3   rk   r8   r5   r-   r   r9   )r'   r:   r;   r<   r   r=   dataframe_stat_covr?   r@   r   rh   ri   rJ   r-   rF   rK   rL   rM   rN   r   rO   r   r   rQ   rn   s                 r*   covzDataFrameStatFunctions.cov   sn   6 ??++66;;=D$TYY%A%A4HDOO((17		4H7		4H+,224 DAq--113AADAD
 OO$$//44T: ((3399$?./ __##JtT$:e#LB =AN+b++X=MXQWXF.s1vay8D8r,   c                   d}|r| j                   j                  j                  j                         }t	        |j
                  j                  |      }| j                   j                  |j                         t        |j                  |       t        |j                  |       |@|j                         D ]-  \  }}|j                  j                         }	||	_        ||	_        / | j                   j#                  t%        |      d      j'                  |      d   d   }
|
t(        kD  rt+        j,                  |
t(              | j                   j#                  |d      j/                  d      j'                  |      D cg c]  }|d   	 }}| j                   j#                  ||d      j1                  ||d      j3                  t5        |      d      }t7        |dd       |r|j8                  |_        |S c c}w )	ad  Computes a pair-wise frequency table (a ``contingency table``) for the specified columns.
        The method returns a DataFrame containing this table.

        In the returned contingency table:
            - The first column of each row contains the distinct values of ``col1``.
            - The name of the first column is the name of ``col1``.
            - The rest of the column names are the distinct values of ``col2``.
            - For pairs that have no occurrences, the contingency table contains 0 as the count.

        Note:
            The number of distinct values in ``col2`` should not exceed 1000.

        Example::

            >>> df = session.create_dataframe([(1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3)], schema=["key", "value"])
            >>> ct = df.stat.crosstab("key", "value").sort(df["key"])
            >>> ct.show()  # doctest: +SKIP
            ---------------------------------------------------------------------------------------------
            |"KEY"  |"CAST(1 AS NUMBER(38,0))"  |"CAST(2 AS NUMBER(38,0))"  |"CAST(3 AS NUMBER(38,0))"  |
            ---------------------------------------------------------------------------------------------
            |1      |1                          |1                          |0                          |
            |2      |2                          |0                          |1                          |
            |3      |0                          |1                          |1                          |
            ---------------------------------------------------------------------------------------------
            <BLANKLINE>

        Args:
            col1: The name of the first column to use.
            col2: The name of the second column to use.
            statement_params: Dictionary of statement level parameters to be set while executing this action.
        NFr3   )r-   r   zDataFrameStatFunctions.crosstabr   r5   )r'   r:   r;   r<   r   r=   dataframe_stat_cross_tabr?   r@   r   rh   ri   rJ   r-   rF   rK   rL   rO   r   rQ   _MAX_COLUMNS_PER_TABLEr   DF_CROSS_TAB_COUNT_TOO_LARGEdistinctpivotaggr   r   uid_ast_id)r(   rh   ri   r.   r-   rY   r=   r[   r\   r2   	row_countrowcolumn_namesr@   s                 r*   crosstabzDataFrameStatFunctions.crosstab&  s   R ??++66;;=D$TYY%G%GNDOO((17		4H7		4H+,224 DAq--113AADAD OO**4 E + 

$
$6F
$
GKKLN	 --1NN1 
 --de-DXX&''!1 ( 	
 F
 
 OO""4"?U4U7StS. 	
 	B APQRBJ	%
s   G?	fractionsdf_generatorc                 x   t               5 }t        d |j                         D cg c]  \  }} || ||       c}}      }d d d        t        d| j                  j
                  j                  |j
                  j                  j                         j                                |S c c}}w # 1 sw Y   oxY w)Nc                 (    | j                  |d      S NFr3   )	union_all)rf   ys     r*   <lambda>zBDataFrameStatFunctions._sample_by_with_union_all.<locals>.<lambda>  s    Q[[e[< r,   z+DataFrameStatFunctions.sample_by[union_all])precallssubcallsresource_usage)	r   r   rJ   r   r'   _plan	api_callscopyget_resource_usage)r(   r/   r   r   resource_usage_collectorr[   r\   res_dfs           r*   _sample_by_with_union_allz0DataFrameStatFunctions._sample_by_with_union_all|  s     $% 	)A<6?oo6GHdadAq)HF	
 	9__**44\\++0023FFH	
  I	 	s   B0B*B0*B00B9c                 0   t        | j                  j                  |j                  |      }t	               5 }| j                  j
                  r~| j                  j                  }|j                  j                  |j                  j                  ||j                        |j                        }| j                  j                  |      }n| j                  j                  |      }d d d        t        dj                                |S # 1 sw Y   'xY w)N)analyzer)from_r   z.DataFrameStatFunctions.sample_by[percent_rank])r   )r
   r'   r   _expressionr   _select_statementsession	_analyzercreate_select_statementcreate_select_snowflake_plan
_with_planr   r   )	r(   r/   r   r.   sample_by_planr   r   select_stmtr   s	            r*   _sample_by_with_percent_rankz3DataFrameStatFunctions._sample_by_with_percent_rank  s     "$//"7"7)T#% 	D)A00//11%//GG!++HH&1B1B I  %..	 H  33K@33NC	D 	<3FFH	

 %	D 	Ds   B0DDseedc                     d}|rډ j                   j                  j                  j                         }t	        |j
                  j                  |      }t        |j                         |O|j                         D ]<  \  }}|j                  j                         }	t        |	j                  |       ||	_        >  j                   j                  |j                          |s@ j                   j#                  dd      }
t%        |
dd       |r|j&                  |
_        |
S t+        d      \t-         j                   t.        j0                  j2                        r.d	t4        d
t6        f fdfd} j9                  ||      }
njt;        dd        j                   j                  j<                  j?                  d      r jA                  |      }
nfd} j9                  ||      }
|r|j&                  |
_        |
S )a  Returns a DataFrame containing a stratified sample without replacement, based on a ``dict`` that specifies the fraction for each stratum.

        Example::

            >>> df = session.create_dataframe([("Bob", 17), ("Alice", 10), ("Nico", 8), ("Bob", 12)], schema=["name", "age"])
            >>> fractions = {"Bob": 0.5, "Nico": 1.0}
            >>> sample_df = df.stat.sample_by("name", fractions)  # non-deterministic result

        Args:
            col: The name of the column that defines the strata.
            fractions: A ``dict`` that specifies the fraction to use for the sample for each stratum.
                If a stratum is not specified in the ``dict``, the method uses 0 as the fraction.
            seed: Specifies a seed value to make the sampling deterministic. Can be any integer between 0 and 2147483647 inclusive.
                Default value is ``None``. This parameter is only supported for :class:`Table`, and it will be ignored
                if it is specified for :class`DataFrame`.
        Nr   Fr3   z'DataFrameStatFunctions.sample_by[empty]r8   r5   	sample_byr[   r%   c                     j                   j                  j                  j                  | k(  j                  j                   j
                  j                        S )N)$df_aliased_col_name_to_real_col_name)r'   r:   r   binary_operator_extractorr   r   r   )r[   r/   r(   s    r*   equal_condition_strz=DataFrameStatFunctions.sample_by.<locals>.equal_condition_str  sL    //99SSAX**9=9N9N9s9s T  r,   c                     | j                   j                  j                  d| j                   j                   d|dz   d d |       d      S )NzSELECT * FROM z	 SAMPLE (g      Y@z) SEED (z) WHERE Fr3   )r'   r:   sql
table_name)r(   r[   r\   r   r   s      r*   r   z6DataFrameStatFunctions.sample_by.<locals>.df_generator  sn    //33$T__%?%?$@	!e)T\]a\bbjk~  @A  lB  kC  D# 4  r,   )r/   r   r   zstat.sample_byz`seed` argument is ignored on `DataFrame` object. Save this DataFrame to a temporary table to get a `Table` object and specify a seed.use_simplified_query_generation)r/   r   c                 f    | j                   j                  |k(  d      j                  |d      S r   )r'   filtersample)r(   r[   r\   r/   s      r*   r   z6DataFrameStatFunctions.sample_by.<locals>.df_generator  s9    ??11#(e1LSSU T  r,   )!r'   r:   r;   r<   r   r=   dataframe_stat_sample_byr   r/   rJ   r   rF   r   rK   rL   r?   r@   limitr   ry   rz   r   rA   	snowflakesnowparkTabler   rB   r   r   confgetr   )r(   r/   r   r   r.   rY   r=   r[   r\   r2   r   r   r   s   `` `        @r*   r   z DataFrameStatFunctions.sample_by  s   2 ??++66;;=D$TYY%G%GND7#F$%OO- DAq**,A.qttQ7AD
 OO((1__**1*>FAPQ !%MS+.
4??I<N<N<T<T U{ s  339< 4 F $B '',,001RS::si:X
 77y| 8  !XXFNr,   )r)   r$   r%   N)T)NT)__name__
__module____qualname____doc__r+   r   r   r   r    floatr   r   rB   boolr   rg   r   rq   r~   r   r   r   r	   r   intr   approxQuantilesampleByr9   r,   r*   r#   r#   4   s   $1$ 
$  6:q<,!778q UOq
 #4S>2q q 
tE{De--	.q qf  597979 79
 79 #4S>279 
%79 79r  596969 69
 69 #4S>269 
%69 69p  59SS S
 S #4S>2S 
(S Sj U*+ 	
 
(0 	 U*+ 	
 
(6 
 #]] U*+] sm	]
 ] 
(] ]~ %NHr,   r#   )1sys	functoolsr   typingr   r   r   r   r   snowflake.snowparkr   r	   5snowflake.snowpark._internal.analyzer.unary_plan_noder
   &snowflake.snowpark._internal.ast.utilsr   r   r   r   *snowflake.snowpark._internal.error_messager   &snowflake.snowpark._internal.telemetryr   r   r   'snowflake.snowpark._internal.type_utilsr   r   "snowflake.snowpark._internal.utilsr   r   snowflake.snowpark.functionsr   r   r   r   rm   r   r   r   version_infor    collections.abcloggingr!   r   _loggerrt   r#   r9   r,   r*   <module>r      s   
   8 8  % J  W 
 N A   v( 
H
 X Xr,   