From 485be626f366e2d9d3c9df8092b2e12cf56686f8 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 19 Oct 2023 17:08:36 +0200 Subject: [PATCH 1/3] Inital version for sentence complete clf --- classifiers/__init__.py | 4 +- .../sentence_complete_classifier/README.md | 1 + .../sentence_complete_classifier/__init__.py | 32 +++++++++++ .../code_snippet_common.md | 57 +++++++++++++++++++ .../code_snippet_refinery.md | 17 ++++++ .../sentence_complete_classifier/config.py | 38 +++++++++++++ 6 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 classifiers/reference_quality/sentence_complete_classifier/README.md create mode 100644 classifiers/reference_quality/sentence_complete_classifier/__init__.py create mode 100644 classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md create mode 100644 classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md create mode 100644 classifiers/reference_quality/sentence_complete_classifier/config.py diff --git a/classifiers/__init__.py b/classifiers/__init__.py index 658ec826..a2f1cb09 100644 --- a/classifiers/__init__.py +++ b/classifiers/__init__.py @@ -25,6 +25,7 @@ from .reference_quality import ( word_count_classifier, special_character_classifier, + sentence_complete_classifier ) from .dates_and_times import ( @@ -75,7 +76,8 @@ special_character_classifier, chunked_sentence_complexity, question_type_classifier, - communication_style_classifier + communication_style_classifier, + sentence_complete_classifier ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/classifiers/reference_quality/sentence_complete_classifier/README.md b/classifiers/reference_quality/sentence_complete_classifier/README.md new file mode 100644 index 00000000..d527e078 --- /dev/null +++ b/classifiers/reference_quality/sentence_complete_classifier/README.md @@ -0,0 +1 @@ +Languages can be very dynamic and complicated. 
This brick does not try to accurately classify all sentences, which would be quite complex. Instead, this brick is meant to check whether the text has some characteristics that a lot of complete sentences share. These characteristics are: whether the sentence starts with an uppercase character, whether it ends with a punctuation mark, and whether it contains at least two nouns and a verb. The name `starts_with_uppercase_ends_with_punctuation_and_contains_two_nouns_and_a_verb` would be a bit long for a brick, though. \ No newline at end of file diff --git a/classifiers/reference_quality/sentence_complete_classifier/__init__.py b/classifiers/reference_quality/sentence_complete_classifier/__init__.py new file mode 100644 index 00000000..0f19b798 --- /dev/null +++ b/classifiers/reference_quality/sentence_complete_classifier/__init__.py @@ -0,0 +1,32 @@ +from pydantic import BaseModel +from extractors.util.spacy import SpacySingleton + +INPUT_EXAMPLE = { + "text": "it would be sad if", + "spacy_model": "en_core_web_sm" +} + +class SentenceCompleteClassifierModel(BaseModel): + text: str + spacy_model: str + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + +def sentence_complete_classifier(req: SentenceCompleteClassifierModel): + """Classify weather or not a text is complete""" + nlp = SpacySingleton.get_nlp(req.spacy_model) + doc = nlp(req.text) + + for sent in doc.sents: + if sent[0].is_title and sent[-1].is_punct: + has_noun = 2 + has_verb = 1 + for token in sent: + if token.pos_ in ["NOUN", "PROPN", "PRON"]: + has_noun -= 1 + elif token.pos_ == "VERB": + has_verb -= 1 + if has_noun < 1 and has_verb < 1: + return {"sentence": "complete"} + return {"sentence": "incomplete"} \ No newline at end of file diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md new file mode 100644 index 00000000..e2e399a8 --- /dev/null +++ 
b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md @@ -0,0 +1,57 @@ +```python +import spacy + +loaded_models = {} +def load_spacy(spacy_model): + if spacy_model not in loaded_models: + loaded_models[spacy_model] = spacy.load(spacy_model) + return loaded_models[spacy_model] + +def sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") -> str: + """ + @param text: The text to classify + @param spacy_model: A spaCy language model + @returns: Classification for the text based on all sentences + """ + nlp = load_spacy(spacy_model) + doc = nlp(text) + classifications = [] + + for sent in doc.sents: + if sent[0].is_title and sent[-1].is_punct: + has_noun = 2 + has_verb = 1 + for token in sent: + if token.pos_ in ["NOUN", "PROPN", "PRON"]: + has_noun -= 1 + elif token.pos_ == "VERB": + has_verb -= 1 + if has_noun < 1 and has_verb < 1: + classifications.append("complete") + else: + classifications.append("incomplete") + else: + classifications.append("incomplete") + + # Aggregation logic + if all(classification == "complete" for classification in classifications): + return "complete" + elif any(classification == "incomplete" for classification in classifications): + return "partly complete" + + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + texts = [ + "This is a complete sentence written by me!", + "The first sentence I have written is complete! However, the second one... 
+ "and they rand over here and then" + ] + for text in texts: + print(f"The text '{text}' is -> {sentence_complete_classifier(text)}") + +example_integration() +``` \ No newline at end of file diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md new file mode 100644 index 00000000..e79f2bc7 --- /dev/null +++ b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md @@ -0,0 +1,17 @@ +```python +ATTRIBUTE: str = "text" + +def sentence_complete_classifier(record): + for sent in record[ATTRIBUTE].sents: + if sent[0].is_title and sent[-1].is_punct: + has_noun = 2 + has_verb = 1 + for token in sent: + if token.pos_ in ["NOUN", "PROPN", "PRON"]: + has_noun -= 1 + elif token.pos_ == "VERB": + has_verb -= 1 + if has_noun < 1 and has_verb < 1: + return "complete" + return "incomplete" +``` \ No newline at end of file diff --git a/classifiers/reference_quality/sentence_complete_classifier/config.py b/classifiers/reference_quality/sentence_complete_classifier/config.py new file mode 100644 index 00000000..7b8b78c7 --- /dev/null +++ b/classifiers/reference_quality/sentence_complete_classifier/config.py @@ -0,0 +1,38 @@ +from util.configs import build_classifier_function_config +from util.enums import State, BricksVariableType, RefineryDataType, SelectionType +from . 
import sentence_complete_classifier, INPUT_EXAMPLE + + +def get_config(): + return build_classifier_function_config( + # strapi information + function=sentence_complete_classifier, + input_example=INPUT_EXAMPLE, + issue_id=349, + tabler_icon="LanguageKatakana", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + available_for=["refinery", "common"], + part_of_group=[ + "reference_quality", + ], # first entry should be parent directory + # bricks integrator information + cognition_init_mapping = { + "incomplete": "Needs fix", + "complete": "null" + }, + integrator_inputs={ + "name": "sentence_complete_classifier", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value + ] + }, + } + } + ) From 935a27f20e08c0d44ba3b7bdc43892558cf9d263 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 19 Oct 2023 19:39:33 +0200 Subject: [PATCH 2/3] Added some aggregation logic --- .../sentence_complete_classifier/__init__.py | 18 +++++++++++++++--- .../code_snippet_common.md | 6 ++++-- .../code_snippet_refinery.md | 16 ++++++++++++++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/classifiers/reference_quality/sentence_complete_classifier/__init__.py b/classifiers/reference_quality/sentence_complete_classifier/__init__.py index 0f19b798..a291c58a 100644 --- a/classifiers/reference_quality/sentence_complete_classifier/__init__.py +++ b/classifiers/reference_quality/sentence_complete_classifier/__init__.py @@ -14,10 +14,11 @@ class Config: schema_extra = {"example": INPUT_EXAMPLE} def sentence_complete_classifier(req: SentenceCompleteClassifierModel): - """Classify weather or not a text is complete""" + """Classify wether or not a text is complete""" nlp = SpacySingleton.get_nlp(req.spacy_model) doc = nlp(req.text) + classifications = [] for sent in doc.sents: if 
sent[0].is_title and sent[-1].is_punct: has_noun = 2 @@ -28,5 +29,16 @@ def sentence_complete_classifier(req: SentenceCompleteClassifierModel): elif token.pos_ == "VERB": has_verb -= 1 if has_noun < 1 and has_verb < 1: - return {"sentence": "complete"} - return {"sentence": "incomplete"} \ No newline at end of file + classifications.append("complete") + else: + classifications.append("incomplete") + else: + classifications.append("incomplete") + + # Aggregation logic + if all(classification == "complete" for classification in classifications): + return {"text_completeness": "complete"} + elif all(classification == "incomplete" for classification in classifications): + return {"text_completeness": "incomplete"} + elif any(classification == "incomplete" for classification in classifications): + return {"text_completeness": "partly complete"} \ No newline at end of file diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md index e2e399a8..acaebfeb 100644 --- a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md +++ b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_common.md @@ -15,8 +15,8 @@ def sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") """ nlp = load_spacy(spacy_model) doc = nlp(text) - classifications = [] + classifications = [] for sent in doc.sents: if sent[0].is_title and sent[-1].is_punct: has_noun = 2 @@ -36,6 +36,8 @@ def sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") # Aggregation logic if all(classification == "complete" for classification in classifications): return "complete" + elif all(classification == "incomplete" for classification in classifications): + return "incomplete" elif any(classification == "incomplete" for classification in classifications): return "partly complete" @@ -47,7 +49,7 @@ def 
sentence_complete_classifier(text: str, spacy_model: str = "en_core_web_sm") def example_integration(): texts = [ "This is a complete sentence written by me!", - "The first sentence I have written is complete! However, the second one... + "The first sentence I have written is complete! However, the second one...", "and they rand over here and then" ] for text in texts: diff --git a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md index e79f2bc7..92017a0c 100644 --- a/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md +++ b/classifiers/reference_quality/sentence_complete_classifier/code_snippet_refinery.md @@ -2,6 +2,7 @@ ATTRIBUTE: str = "text" def sentence_complete_classifier(record): + classifications = [] for sent in record[ATTRIBUTE].sents: if sent[0].is_title and sent[-1].is_punct: has_noun = 2 @@ -12,6 +13,17 @@ def sentence_complete_classifier(record): elif token.pos_ == "VERB": has_verb -= 1 if has_noun < 1 and has_verb < 1: - return "complete" - return "incomplete" + classifications.append("complete") + else: + classifications.append("incomplete") + else: + classifications.append("incomplete") + + # Aggregation logic + if all(classification == "complete" for classification in classifications): + return "complete" + elif all(classification == "incomplete" for classification in classifications): + return "incomplete" + elif any(classification == "incomplete" for classification in classifications): + return "partly complete" ``` \ No newline at end of file From 511c6c50b733109c8b5f28702a4661c5e87b92f6 Mon Sep 17 00:00:00 2001 From: Leonard Date: Thu, 19 Oct 2023 19:45:38 +0200 Subject: [PATCH 3/3] Fixed typo --- .../reference_quality/sentence_complete_classifier/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifiers/reference_quality/sentence_complete_classifier/__init__.py 
b/classifiers/reference_quality/sentence_complete_classifier/__init__.py index a291c58a..81ed677c 100644 --- a/classifiers/reference_quality/sentence_complete_classifier/__init__.py +++ b/classifiers/reference_quality/sentence_complete_classifier/__init__.py @@ -14,7 +14,7 @@ class Config: schema_extra = {"example": INPUT_EXAMPLE} def sentence_complete_classifier(req: SentenceCompleteClassifierModel): - """Classify wether or not a text is complete""" + """Classify whether or not a text is complete""" nlp = SpacySingleton.get_nlp(req.spacy_model) doc = nlp(req.text)