
    de                       d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZ d dlmZ g dZdBdZdCdZeedddddDd            ZeedddddEd"            ZeeddddFd$ZeedddddGd&            ZeedddddHd(            ZeeddddId)Zeeddddd*dJd/            Zeeddddd*dKd1            Zeed2ddd*dLd4Z	 d dlZn# e$ r Y nw xY wdMd8ZdNd9Zeddddd:d;dOdAZ dS )P    )annotationsN)isnan)AnyCallable
CollectionHashableIterableMappingSequenceoverload)
ScorerFlag)WRatioratio)default_process)extractextract_iter
extractOnecdistscorerr   kwargsdict[str, Any]returntuple[int, int]c                h    t          | dd           }| |d         di |}|d         |d         fS dS )N_RF_ScorerPyget_scorer_flagsworst_scoreoptimal_score)r   d    )getattrr   r   paramsflagss       X/home/feoh/.local/pipx/venvs/poetry/lib/python3.11/site-packages/rapidfuzz/process_py.py_get_scorer_flags_pyr&      sN    V^T22F*)*44V44m$eO&<==8    sboolc                Z    | dS t          | t                    rt          |           rdS dS )NTF)
isinstancefloatr   )r(   s    r%   _is_noner-   "   s5    yt!U a t5r'   )r   	processorscore_cutoff
score_hintquerySequence[Hashable] | Nonechoices#Iterable[Sequence[Hashable] | None]Callable[..., int | float]r.   /Callable[..., Sequence[Hashable]] | None | boolr/   int | float | Noner0   5Iterable[tuple[Sequence[Hashable], int | float, int]]c                   d S Nr    r1   r3   r   r.   r/   r0   r   s          r%   r   r   ,   	     Cr'   'Mapping[Any, Sequence[Hashable] | None]5Iterable[tuple[Sequence[Hashable], int | float, Any]]c                   d S r:   r    r;   s          r%   r   r   :   r<   r'   MIterable[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None]c             +    K   t          ||          \  }}||k    }	t          |           rdS |du rt          }n|du rd}||}| ||           } t          |d          r|                                nt          |          }
|
D ]X\  }}t          |          r| || |fd|d|}n ||  ||          fd|d|}|	r||k    r|||fV  K||k    r|||fV  YdS )aD  
    Find the best match in a list of choices

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a mapping
        {<result>: <string to compare>}
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance), or
        a custom function, which returns a normalized edit distance.
        fuzz.WRatio is used by default.
    processor : Callable, optional
        Optional callable that reformats the strings.
        utils.default_process is used by default, which lowercases the strings and trims whitespace
    score_cutoff : Any, optional
        Optional argument for a score threshold. When an edit distance is used this represents the maximum
        edit distance and matches with a `distance <= score_cutoff` are ignored. When a
        normalized edit distance is used this represents the minimal similarity
        and matches with a `similarity >= score_cutoff` are ignored. Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    **kwargs : Any, optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to string_metric.levenshtein

    Yields
    -------
    Tuple[Sequence[Hashable], Any, Any]
        Yields similarity between the query and each choice in form of a Tuple with 3 elements.
        The values stored in the tuple depend on the types of the input arguments.

        * The first element is always the current `choice`, which is the value that's compared to the query.

        * The second value represents the similarity calculated by the scorer. This can be:

          * An edit distance (distance is 0 for a perfect match and > 0 for non perfect matches).
            In this case only choices which have a `distance <= max` are yielded.
            An example of a scorer with this behavior is `string_metric.levenshtein`.
          * A normalized edit distance (similarity is a score between 0 and 100, with 100 being a perfect match).
            In this case only choices which have a `similarity >= score_cutoff` are yielded.
            An example of a scorer with this behavior is `string_metric.normalized_levenshtein`.

          Note, that for all scorers, which are not provided by RapidFuzz, only normalized edit distances are supported.

        * The third parameter depends on the type of the `choices` argument it is:

          * The `index of choice` when choices is a simple iterable like a list
          * The `key of choice` when choices is a mapping like a dict, or a pandas Series

    NTFitemsr.   r/   r&   r-   r   hasattrrB   	enumerate)r1   r3   r   r.   r/   r0   r   r   r   lowest_score_worstchoices_iterkeychoicescores                 r%   r   r   H   s     H "6ff!E!EK&4 D#			e			" 	%   '.gw&?&?W7==???YwEWEWL# + +VF 	Fv)-L LR EE F	&!! )	 
  E  	+$$uc****$$uc****-+ +r'   2tuple[Sequence[Hashable], int | float, int] | Nonec                   d S r:   r    r;   s          r%   r   r      r<   r'   2tuple[Sequence[Hashable], int | float, Any] | Nonec                   d S r:   r    r;   s          r%   r   r      r<   r'   c                  t          ||          \  }}||k    }	t          |           rdS |du rt          }n|du rd}||}| ||           } d}
t          |d          r|                                nt          |          }|D ]|\  }}t          |          r| || |fd|d|}n ||  ||          fd|d|}|	r||k    r|
||
d         k    r|}|||f}
n||k    r|
||
d         k     r|}|||f}
||k    r n}|
S )a  
    Find the best match in a list of choices. When multiple elements have the same similarity,
    the first element is returned.

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a mapping
        {<result>: <string to compare>}
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance), or
        a custom function, which returns a normalized edit distance.
        fuzz.WRatio is used by default.
    processor : Callable, optional
        Optional callable that reformats the strings.
        utils.default_process is used by default, which lowercases the strings and trims whitespace
    score_cutoff : Any, optional
        Optional argument for a score threshold. When an edit distance is used this represents the maximum
        edit distance and matches with a `distance <= score_cutoff` are ignored. When a
        normalized edit distance is used this represents the minimal similarity
        and matches with a `similarity >= score_cutoff` are ignored. Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    **kwargs : Any, optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to string_metric.levenshtein

    Returns
    -------
    Tuple[Sequence[Hashable], Any, Any]
        Returns the best match in form of a Tuple with 3 elements. The values stored in the
        tuple depend on the types of the input arguments.

        * The first element is always the `choice`, which is the value that's compared to the query.

        * The second value represents the similarity calculated by the scorer. This can be:

          * An edit distance (distance is 0 for a perfect match and > 0 for non perfect matches).
            In this case only choices which have a `distance <= score_cutoff` are returned.
            An example of a scorer with this behavior is `string_metric.levenshtein`.
          * A normalized edit distance (similarity is a score between 0 and 100, with 100 being a perfect match).
            In this case only choices which have a `similarity >= score_cutoff` are returned.
            An example of a scorer with this behavior is `string_metric.normalized_levenshtein`.

          Note, that for all scorers, which are not provided by RapidFuzz, only normalized edit distances are supported.

        * The third parameter depends on the type of the `choices` argument it is:

          * The `index of choice` when choices is a simple iterable like a list
          * The `key of choice` when choices is a mapping like a dict, or a pandas Series

    None
        When no choice has a `similarity >= score_cutoff`/`distance <= score_cutoff` None is returned

    Examples
    --------

    >>> from rapidfuzz.process import extractOne
    >>> from rapidfuzz.string_metric import levenshtein, normalized_levenshtein
    >>> from rapidfuzz.fuzz import ratio

    extractOne can be used with normalized edit distances.

    >>> extractOne("abcd", ["abce"], scorer=ratio)
    ("abcd", 75.0, 1)
    >>> extractOne("abcd", ["abce"], scorer=normalized_levenshtein)
    ("abcd", 75.0, 1)

    extractOne can be used with edit distances as well.

    >>> extractOne("abcd", ["abce"], scorer=levenshtein)
    ("abce", 1, 0)

    additional settings of the scorer can be passed as keyword arguments to extractOne

    >>> extractOne("abcd", ["abce"], scorer=levenshtein, weights=(1,1,2))
    ("abcde", 2, 1)

    when a mapping is used for the choices the key of the choice is returned instead of the List index

    >>> extractOne("abcd", {"key": "abce"}, scorer=ratio)
    ("abcd", 75.0, "key")

    By default each string is preprocessed using `utils.default_process`, which lowercases the strings,
    replaces non alphanumeric characters with whitespaces and trims whitespaces from start and end of them.
    This behavior can be changed by passing a custom function, or None to disable the behavior. Preprocessing
    can take a significant part of the runtime, so it makes sense to disable it, when it is not required.


    >>> extractOne("abcd", ["abdD"], scorer=ratio)
    ("abcD", 100.0, 0)
    >>> extractOne("abcd", ["abdD"], scorer=ratio, processor=None)
    ("abcD", 75.0, 0)
    >>> extractOne("abcd", ["abdD"], scorer=ratio, processor=lambda s: s.upper())
    ("abcD", 100.0, 0)

    When only results with a similarity above a certain threshold are relevant, the parameter score_cutoff can be
    used to filter out results with a lower similarity. This threshold is used by some of the scorers to exit early,
    when they are sure, that the similarity is below the threshold.
    For normalized edit distances all results with a similarity below score_cutoff are filtered out

    >>> extractOne("abcd", ["abce"], scorer=ratio)
    ("abce", 75.0, 0)
    >>> extractOne("abcd", ["abce"], scorer=ratio, score_cutoff=80)
    None

    For edit distances all results with an edit distance above the score_cutoff are filtered out

    >>> extractOne("abcd", ["abce"], scorer=levenshtein, weights=(1,1,2))
    ("abce", 2, 0)
    >>> extractOne("abcd", ["abce"], scorer=levenshtein, weights=(1,1,2), score_cutoff=1)
    None

    NTFrB   rC      rD   )r1   r3   r   r.   r/   r0   r   r   r   rG   resultrH   rI   rJ   rK   s                  r%   r   r      s   F "6ff!E!EK&4 tD#			e			" 	%  AEF '.gw&?&?W7==???YwEWEWL#  VF 	Fv)-L LR EE F	&!! )	 
  E  	.$$&.EF1I<M<M$ %-$$&.EF1I<M<M$ %-M!!E " Mr'   )r   r.   limitr/   r0   %Collection[Sequence[Hashable] | None]rS   
int | None1list[tuple[Sequence[Hashable], int | float, int]]c                   d S r:   r    r1   r3   r   r.   rS   r/   r0   r   s           r%   r   r     	     Cr'   1list[tuple[Sequence[Hashable], int | float, Any]]c                   d S r:   r    rX   s           r%   r   r     rY   r'      OCollection[Sequence[Hashable] | None] | Mapping[Any, Sequence[Hashable] | None]c                   t          ||          \  }}	|	|k    }
t          | |f|||d|}|t          |d |
          S |
rt          j        ||d           S t          j        ||d           S )a  
    Find the best matches in a list of choices. The list is sorted by the similarity.
    When multiple choices have the same similarity, they are sorted by their index

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Collection[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a mapping
        {<result>: <string to compare>}
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance), or
        a custom function, which returns a normalized edit distance.
        fuzz.WRatio is used by default.
    processor : Callable, optional
        Optional callable that reformats the strings.
        utils.default_process is used by default, which lowercases the strings and trims whitespace
    limit : int
        maximum amount of results to return
    score_cutoff : Any, optional
        Optional argument for a score threshold. When an edit distance is used this represents the maximum
        edit distance and matches with a `distance <= score_cutoff` are ignored. When a
        normalized edit distance is used this represents the minimal similarity
        and matches with a `similarity >= score_cutoff` are ignored. Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    **kwargs : Any, optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to string_metric.levenshtein

    Returns
    -------
    List[Tuple[Sequence[Hashable], Any, Any]]
        The return type is always a List of Tuples with 3 elements. However the values stored in the
        tuple depend on the types of the input arguments.

        * The first element is always the `choice`, which is the value that's compared to the query.

        * The second value represents the similarity calculated by the scorer. This can be:

          * An edit distance (distance is 0 for a perfect match and > 0 for non perfect matches).
            In this case only choices which have a `distance <= max` are returned.
            An example of a scorer with this behavior is `string_metric.levenshtein`.
          * A normalized edit distance (similarity is a score between 0 and 100, with 100 being a perfect match).
            In this case only choices which have a `similarity >= score_cutoff` are returned.
            An example of a scorer with this behavior is `string_metric.normalized_levenshtein`.

          Note, that for all scorers, which are not provided by RapidFuzz, only normalized edit distances are supported.

        * The third parameter depends on the type of the `choices` argument it is:

          * The `index of choice` when choices is a simple iterable like a list
          * The `key of choice` when choices is a mapping like a dict, or a pandas Series

        The list is sorted by `score_cutoff` or `max` depending on the scorer used. The first element in the list
        has the `highest similarity`/`smallest distance`.

    )r.   r   r/   Nc                    | d         S NrQ   r    is    r%   <lambda>zextract.<locals>.<lambda>  s
    1 r'   )rI   reversec                    | d         S r`   r    ra   s    r%   rc   zextract.<locals>.<lambda>  s
    ! r'   )rI   c                    | d         S r`   r    ra   s    r%   rc   zextract.<locals>.<lambda>  s
    QqT r'   )r&   r   sortedheapqnlargest	nsmallest)r1   r3   r   r.   rS   r/   r0   r   r   r   rG   result_iters               r%   r   r     s    V "6ff!E!EK&4 !   K }k~~?QRRRR F~e[nnEEEE?5+>>BBBBr'   dtypenp.dtype | Nonenp.dtypec                    dd l }| | S t          |dd           }|1 |d         di |}|d         t          j        z  r|j        S |j        S |j        S )Nr   r   r   r$   r    )numpyr!   r   
RESULT_I64int32float32)rl   r   r   npr#   r$   s         r%   _dtype_to_type_numru     sx    
 V^T22F*)*44V44>J11 	8Oz:r'   c                v    t          | dd           }|% |d         di |}|d         t          j        z  rdS dS )Nr   r   r$   TFr    )r!   r   	SYMMETRICr"   s       r%   _is_symmetricrx   %  sR    V^T22F*)*44V44>J00 	45r'   rQ   )r   r.   r/   r0   rl   workersqueries(Callable[..., Sequence[Hashable]] | Nonery   int
np.ndarrayc                  ddl }	t          ||fi |} |	j        t          |           t          |          f|          }
| |u rt	          |fi |rt          |           }nfd| D             }t          |          D ]Y\  }} |||fd|d||
||f<   t          |dz   t          |                    D ]!} ||||         fd|d|x|
||f<   |
||f<   "Znkt          |          }nfd|D             }t          |           D ];\  }}r |          n|}t          |          D ]\  }} |||fd|d||
||f<   <|
S )a
  
    Compute distance/similarity between each pair of the two collections of inputs.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        list of all strings the queries
    choices : Collection[Sequence[Hashable]]
        list of all strings the query should be compared
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be:

        - a scorer using the RapidFuzz C-API like the builtin scorers in RapidFuzz,
          which can return a distance or similarity between two strings. Further details can be found here.
        - a Python function which returns a similarity between two strings in the range 0-100. This is not
          recommended, since it is far slower than a scorer using the RapidFuzz C-API.

        fuzz.ratio is used by default.
    processor : Callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : Any, optional
        Optional argument for a score threshold to be passed to the scorer.
        Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    dtype : data-type, optional
        The desired data-type for the result array.Depending on the scorer type the following
        dtypes are supported:

        - similarity:
          - np.float32, np.float64
          - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100
        - distance:
          - np.int8, np.int16, np.int32, np.int64

        If not given, then the type will be np.float32 for similarities and np.int32 for distances.
    workers : int, optional
        The calculation is subdivided into workers sections and evaluated in parallel.
        Supply -1 to use all available CPU cores.
        This argument is only available for scorers using the RapidFuzz C-API so far, since it
        releases the Python GIL.
    **kwargs : Any, optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to string_metric.levenshtein

    Returns
    -------
    ndarray
        Returns a matrix of dtype with the distance/similarity between each pair
        of the two collections of inputs.
    r   N)rl   c                &    g | ]} |          S r    r    .0xr.   s     r%   
<listcomp>zcdist.<locals>.<listcomp>{  !    :::QIIaLL:::r'   rC   rQ   c                &    g | ]} |          S r    r    r   s     r%   r   zcdist.<locals>.<listcomp>  r   r'   )rp   ru   zeroslenrx   listrF   range)rz   r3   r   r.   r/   r0   rl   ry   r   rt   resultsproc_queriesrb   r1   jproc_choices
proc_queryrJ   s      `              r%   r   r   /  s3   F uf7777EbhGc'll35AAAG'mF==f====LL::::':::L!,// 	 	HAu"Fu(,< KQ GAqDM 1q5#l"3"344  06 O1 #!-	1 1
 1 1 11		 ==LL::::':::L!'** 		 		HAu-6A5)))EJ&|44  	6 &! #!-	! !
 ! !1 Nr'   )r   r   r   r   r   r   )r(   r   r   r)   )r1   r2   r3   r4   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   r8   )r1   r2   r3   r=   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   r>   )r1   r2   r3   r@   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   r>   )r1   r2   r3   r4   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   rL   )r1   r2   r3   r=   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   rN   )r1   r2   r3   r@   r   r5   r.   r6   r/   r7   r0   r7   r   r   r   rN   )r1   r2   r3   rT   r   r5   r.   r6   rS   rU   r/   r7   r0   r7   r   r   r   rV   )r1   r2   r3   r=   r   r5   r.   r6   rS   rU   r/   r7   r0   r7   r   r   r   rZ   )r1   r2   r3   r]   r   r5   r.   r6   rS   rU   r/   r7   r0   r7   r   r   r   rZ   )rl   rm   r   r5   r   r   r   rn   )r   r5   r   r   r   r)   )rz   rT   r3   rT   r   r5   r.   r{   r/   r7   r0   r7   rl   rm   ry   r|   r   r   r   r}   )!
__future__r   rh   mathr   typingr   r   r   r   r	   r
   r   r   rapidfuzz._utilsr   rapidfuzz.fuzzr   r   rapidfuzz.utilsr   __all__r&   r-   r   r   r   rp   rt   BaseExceptionru   rx   r   r    r'   r%   <module>r      s   # " " " " "       	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ( ' ' ' ' ' ( ( ( ( ( ( ( ( + + + + + +
<
<
<       

 *0AE'+%)
 
 
 
 
 

 

 *0AE'+%)
 
 
 
 
 

$ *0AP'+%)n+ n+ n+ n+ n+ n+b 

 *0AE'+%)
 
 
 
 
 

 

 *0AE'+%)
 
 
 
 
 

$ *0AP'+%)v v v v v vr 

 *0AE'+%)     
 

 *0AE'+%)     
& *0AP'+%)\C \C \C \C \C \C~	 	 	 	D	   (    */:>'+%)!k k k k k k k ks   C C C 