# Source code for ynlu.sdk.evaluation.utils

from typing import List, Dict
import re

from ynlu.sdk.evaluation import NOT_ENTITY, UNKNOWN


# Matches one complete XML-like annotation such as ``<tag>text</tag>``; the
# annotated text is captured in the named group ``KEEP``. Neither the tags nor
# the text may contain ``<`` or ``>``, so nested annotations never match. The
# closing tag is not required to repeat the opening tag's name.
SUB_PROG = re.compile(r"<[^\<\>]*?>(?P<KEEP>[^\<\>]*?)</[^\<\>]*?>")
# Captures ``(tag, text)`` from the leading part of an annotation, i.e.
# ``<tag>text</``; whatever follows ``</`` is not inspected.
FINDALL_PROG = re.compile(r"<([^\<\>]*?)>([^\<\>]*?)</")


def remove_annotation(annotated_utterance: str) -> str:
    """Strip XML-like annotations, keeping only the annotated text.

    Every ``<tag>text</tag>`` span matched by ``SUB_PROG`` is replaced by its
    ``text`` part; all other characters are left untouched.

    Args:
        annotated_utterance: An utterance with annotations, e.g.
            ``"<drink>Coffee</drink>, please."``.

    Returns:
        The utterance without annotation tags, e.g. ``"Coffee, please."``.
    """
    # Raw string on purpose: backslash-g is a regex template reference, not a
    # valid string escape, and a non-raw literal triggers an invalid-escape
    # warning on current Python versions.
    return SUB_PROG.sub(r"\g<KEEP>", annotated_utterance)


def preprocess_annotated_utterance(
        annotated_utterance: str,
        not_entity: str = NOT_ENTITY,
    ) -> List[str]:
    """Produce character-level entity labels from an annotated utterance.

    The utterance is scanned for XML-like annotations (``<tag>text</tag>``).
    The result has one label per character of the utterance *with the
    annotation tags removed*: characters inside an annotation get the tag
    name, all other characters get ``not_entity``.

    Labels are placed using the position of each annotation itself rather
    than by searching for the annotated text, so an un-annotated occurrence
    of the same text earlier in the utterance is never labelled by mistake
    (e.g. ``"tea or <drink>tea</drink>"`` labels only the second "tea").

    Args:
        annotated_utterance: An utterance with annotations that look like
            ``<a>blabla</a>``.
        not_entity: Label used for characters outside any annotation.
            Defaults to the package-level ``NOT_ENTITY`` constant.

    Returns:
        A list of entity labels, one per character of the utterance after
        annotation tags are stripped.

    Note:
        Only complete annotations (as matched by ``SUB_PROG``) are
        recognised. Text that does not form a complete annotation is kept
        verbatim and labelled ``not_entity``.

    Examples:
        >>> preprocess_annotated_utterance(
        ...     annotated_utterance="<drink>Coffee</drink>, please.",
        ...     not_entity="n",
        ... )  # doctest: +NORMALIZE_WHITESPACE
        ['drink', 'drink', 'drink', 'drink', 'drink', 'drink',
         'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
    """
    entities = []
    cursor = 0  # index in annotated_utterance just past the last annotation
    for match in SUB_PROG.finditer(annotated_utterance):
        # Plain characters between the previous annotation and this one.
        entities.extend([not_entity] * (match.start() - cursor))
        # The opening tag is "<" + name + ">" and ends right where the KEEP
        # group begins, so the name sits between those two delimiters.
        keep_start = match.start("KEEP")
        entity = annotated_utterance[match.start() + 1: keep_start - 1]
        entities.extend([entity] * (match.end("KEEP") - keep_start))
        cursor = match.end()
    # Plain characters after the last annotation.
    entities.extend([not_entity] * (len(annotated_utterance) - cursor))
    return entities


def preprocess_entity_prediction(
        utterance: str,
        entity_prediction: List[dict],
        not_entity: str = NOT_ENTITY,
    ) -> List[str]:
    """Produce character-level entity labels from a model's entity prediction.

    Every character of ``utterance`` starts out labelled ``not_entity``. The
    predictions are then consumed in order; for each character of a
    prediction's ``"value"`` the next occurrence of that character in the
    utterance (at or after the current scan position) is labelled with the
    prediction's ``"entity"``, and the scan position moves just past it.

    Args:
        utterance: The text the predictions refer to (an input of
            ``model.predict()``).
        entity_prediction: Entity part of the prediction output. Each
            element is a dict; only its ``"value"`` (a segment of the
            utterance) and ``"entity"`` (the predicted label) keys are read
            here. Other keys such as ``"score"`` are ignored.
        not_entity: Label used for characters not covered by any
            prediction. Defaults to the package-level ``NOT_ENTITY``
            constant.

    Returns:
        A list of entity labels, one per character of ``utterance``.

    Raises:
        ValueError: If a character of a predicted value cannot be found in
            the remaining part of the utterance.

    Examples:
        >>> preprocess_entity_prediction(
        ...     utterance="Coffee, please.",
        ...     entity_prediction=[
        ...         {"value": "Coffee", "entity": "drink", "score": 0.8},
        ...         {"value": ", please.", "entity": "n", "score": 0.7},
        ...     ],
        ...     not_entity="n",
        ... )  # doctest: +NORMALIZE_WHITESPACE
        ['drink', 'drink', 'drink', 'drink', 'drink', 'drink',
         'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
    """
    entities = [not_entity] * len(utterance)
    begin_index = 0
    for pred in entity_prediction:
        # Once the whole utterance has been consumed, any remaining
        # predictions are ignored instead of raising.
        if begin_index >= len(utterance):
            break
        # Characters are located one at a time (not as a whole substring),
        # so a value is matched even when the utterance has extra characters
        # in between the value's characters.
        for char in pred["value"]:
            start_idx = utterance.find(char, begin_index)
            if start_idx == -1:
                raise ValueError(
                    "Word {} can not be found in {}".format(char, utterance),
                )
            entities[start_idx] = pred["entity"]
            begin_index = start_idx + len(char)
    return entities


def preprocess_intent_prediction_by_threshold(
        intent_predictions: List[List[dict]],
        threshold: float = 0.5,
        unknown_token: str = UNKNOWN,
    ) -> List[str]:
    """Replace low-confidence intent predictions with an unknown token.

    For each utterance only the first candidate of its prediction list is
    considered. Its intent is kept when its score is strictly greater than
    ``threshold``; otherwise (including a score equal to the threshold, or
    an empty candidate list) ``unknown_token`` is used instead.

    Args:
        intent_predictions: One list of candidates per utterance (the
            intent part of the ``model.predict()`` output). Each candidate
            is a dict with an ``"intent"`` label and a numeric ``"score"``.
            NOTE(review): the first candidate is presumably the
            highest-scoring one; this function does not sort or verify.
        threshold: Minimum score (exclusive) for a prediction to be kept.
            Defaults to 0.5.
        unknown_token: Label substituted for low-confidence predictions.
            Defaults to the package-level ``UNKNOWN`` constant.

    Returns:
        A flat list with one intent label per utterance.

    Examples:
        >>> preprocess_intent_prediction_by_threshold(
        ...     intent_predictions=[
        ...         [{"intent": "a", "score": 0.3}, {"intent": "b", "score": 0.1}],
        ...         [{"intent": "b", "score": 0.7}, {"intent": "c", "score": 0.3}],
        ...     ],
        ...     threshold=0.5,
        ...     unknown_token="oo",
        ... )
        ['oo', 'b']
    """
    output = []
    for intent_pred in intent_predictions:
        # An empty candidate list carries no usable prediction, so it is
        # treated like a low-confidence one rather than raising IndexError.
        if intent_pred and intent_pred[0]["score"] > threshold:
            output.append(intent_pred[0]["intent"])
        else:
            output.append(unknown_token)
    return output
# Source code for ynlu.sdk.evaluation.utils
#
# yoctol-nlu
#
# Navigation
#
# Related Topics