diff --git a/README.md b/README.md index d7e099fa..7bad8a1d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ See how easy it is to use any of the **thousands** of models in 1 line of code, ## NLU & Streamlit in Action This 1 line let's you visualize and play with **1000+ SOTA NLU & NLP models** in **200** languages -for **Named Entitiy Recognition**, **Dependency Trees & Parts of Speech**, **Classification for 100+ problems**, **Text Summarization & Question Answering using T5** , **Translation with Marian**, **Text Similarity Matrix** using **BERT, ALBERT, ELMO, XLNET, ELECTRA** with other of the **100+ wordembeddings** and much more using [Streamlit](http://streamlit.com/) . ```shell streamlit run https://raw.githubusercontent.com/JohnSnowLabs/nlu/master/examples/streamlit/01_dashboard.py @@ -23,7 +22,7 @@ View the [NLU&Streamlit documentation](https://nlu.johnsnowlabs.com/docs/en/stre The entire GIF demo and -## All NLU ressources overview +## All NLU resources overview Take a look at our official NLU page: [https://nlu.johnsnowlabs.com/](https://nlu.johnsnowlabs.com/) for user documentation and examples | Ressource | Description| @@ -218,8 +217,6 @@ Symbol or unwanted syntax, spellchecking, detecting entities, analyzing sentimen | NLU 20 Minutes Crashcourse - the fast Data Science route | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/webinars_conferences_etc/python_web_conf/NLU_crashcourse_py_web.ipynb) | - | - - # Need help? - [Ping us on Slack](https://spark-nlp.slack.com/archives/C0196BQCDPY) - [Post an issue on Github](https://github.com/JohnSnowLabs/nlu/issues) diff --git a/examples/webinars_conferences_etc/multi_lingual_webinar/models_that_need_to_be_upadted.md b/examples/webinars_conferences_etc/multi_lingual_webinar/models_that_need_to_be_upadted.md deleted file mode 100644 index c30e6f60..00000000 --- a/examples/webinars_conferences_etc/multi_lingual_webinar/models_that_need_to_be_upadted.md +++ /dev/null @@ -1,55 +0,0 @@ -# Health Care models : -| | name | language | libVersion | sparkVersion | readyToUse | time | isZipped | category | checksum | -| ---: | :---------------------------------- | :------- | :------------------- | :---------------- | :--------- | :------------------------------- | :------- | :------- | :------- | -| 306 | chunkresolve_icd10cm_hcc_clinical | en | {'parts': [2, 6, 3]} | {'parts': [2, 4]} | True | 2020-12-12 03:58:17.766000+00:00 | True | nd | | -| 307 | chunkresolve_icd10cm_hcc_healthcare | en | {'parts': [2, 6, 3]} | {'parts': [2, 4]} | True | 2020-12-12 04:23:11.972000+00:00 | True | nd | | -| 315 | textmatch_icd10cm | en | {'parts': [2, 6, 3]} | {'parts': [2, 4]} | True | 2020-12-23 19:43:53.108000+00:00 | True | nd | | -| 316 | textmatch_icd10cm | en | {'parts': [2, 6, 3]} | {'parts': [2, 4]} | True | 2020-12-23 19:57:16.404000+00:00 | True | nd | | -| 321 | jsl_ner_wip_clinical | en | {'parts': [2, 6, 5]} | {'parts': [2, 4]} | True | 2021-01-01 12:53:48.141000+00:00 | True | nd | | - - - -# Open source models : -| | name | language | libVersion | sparkVersion | readyToUse | time | isZipped | checksum | category | -| ---: | :------------------------------ | :------- | :------------------- | :---------------- | :--------- | :------------------------------- | :------- | :------- | :------- | -| 617 | lemma | ar | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-11-28 14:16:06.993000+00:00 | True | | ml | -| 619 | lemma | 
et | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-11-28 16:19:39.171000+00:00 | True | | ml | -| 620 | lemma | fa | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-11-28 16:32:07.793000+00:00 | True | | ml | -| 618 | lemma | ur | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-11-28 17:04:20.260000+00:00 | True | | ml | -| 627 | sentimentdl_urduvec_imdb | ur | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-01 10:05:35.630000+00:00 | True | | ml | -| 628 | ld_wiki_tatoeba_cnn_21 | xx | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-02 14:12:42.782000+00:00 | True | | ml | -| 629 | detect_language_21 | xx | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-02 14:12:51.556000+00:00 | True | | pl | -| 633 | ld_wiki_tatoeba_cnn_21 | xx | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-05 14:17:57.570000+00:00 | True | | ml | -| 634 | detect_language_21 | xx | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-05 15:11:20.664000+00:00 | True | | pl | -| 676 | lemma | he | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-09 14:04:44.355000+00:00 | True | | ml | -| 678 | google_t5_small_ssm_nq | en | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-21 12:01:13.257000+00:00 | True | | ml | -| 679 | t5_small | en | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2020-12-21 12:38:12.913000+00:00 | True | | ml | -| 1004 | sentence_detector_dl | xx | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-02 18:03:36.998000+00:00 | True | | ml | -| 1005 | sentence_detector_dl | en | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-02 18:10:52.663000+00:00 | True | | ml | -| 1337 | pos_ud_gsd | zh | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-03 18:42:08.856000+00:00 | True | | ml | -| 1338 | pos_ud_gsd | ja | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-03 18:55:50.824000+00:00 | True | | ml | -| 1345 | classifierdl_use_trec6 | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-08 15:01:02.425000+00:00 | True | | ml | -| 1346 | classifierdl_use_trec50 | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-08 15:05:28.412000+00:00 | True | | ml | -| 1349 | t5_small | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-08 19:13:39.885000+00:00 | True | | ml | -| 1351 | google_t5_small_ssm_nq | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-08 20:19:35.322000+00:00 | True | | ml | -| 1352 | sentimentdl_urduvec_imdb | ur | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 09:44:27.237000+00:00 | True | | ml | -| 1353 | classifierdl_use_spam | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 10:10:19.592000+00:00 | True | | ml | -| 1354 | classifierdl_use_fakenews | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 10:16:39.147000+00:00 | True | | ml | -| 1355 | classifierdl_use_cyberbullying | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 10:28:03.627000+00:00 | True | | ml | -| 1356 | classifierdl_use_emotion | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 11:09:23.302000+00:00 | True | | ml | -| 1357 | sentimentdl_glove_imdb | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 16:11:00.282000+00:00 | True | | ml | -| 1358 | classifierdl_use_sarcasm | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-09 16:49:16.231000+00:00 | True | | ml | -| 1360 | sentimentdl_use_imdb | en | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | 
True | 2021-01-15 12:54:07.685000+00:00 | True | | ml | -| 1364 | analyze_sentimentdl_glove_imdb | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-15 14:47:38.784000+00:00 | True | | pl | -| 1365 | analyze_sentimentdl_use_imdb | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-15 15:17:16.151000+00:00 | True | | pl | -| 1367 | lemma | ja | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-15 21:38:11.356000+00:00 | True | | ml | -| 1366 | lemma | ko | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-15 21:44:15.280000+00:00 | True | | ml | -| 1368 | sentimentdl_use_twitter | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-18 15:25:24.713000+00:00 | True | | ml | -| 1370 | lemma | bh | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-18 17:00:21.391000+00:00 | True | | ml | -| 1371 | analyze_sentimentdl_use_twitter | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-18 18:11:10.852000+00:00 | True | | pl | -| 1372 | lemma | bn | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-20 17:28:11.269000+00:00 | True | | ml | -| 1375 | lemma | am | {'parts': [2, 7, 0]} | {'parts': [2, 4]} | True | 2021-01-20 22:29:50.547000+00:00 | True | | ml | -| 1376 | multiclassifierdl_use_toxic_sm | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-21 12:04:05.484000+00:00 | True | | ml | -| 1377 | multiclassifierdl_use_toxic | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-21 12:20:04.648000+00:00 | True | | ml | -| 1378 | multiclassifierdl_use_e2e | en | {'parts': [2, 7, 1]} | {'parts': [2, 4]} | True | 2021-01-21 12:48:25.602000+00:00 | True | | ml | -| 1379 | spellcheck_dl | en | {'parts': [2, 7, 2]} | {'parts': [2, 4]} | True | 2021-01-23 09:27:45.565000+00:00 | True | | nd | \ No newline at end of file diff --git a/nlu/__init__.py b/nlu/__init__.py index 32798441..0b5cbd64 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -1,9 +1,10 @@ -__version__ = '3.3.0' +__version__ = '3.3.1' hard_offline_checks = False def version(): return __version__ + # if not check_pyspark_install(): raise Exception() def try_import_pyspark_in_streamlit(): """Try importing Pyspark or display warn message in streamlit""" @@ -92,6 +93,7 @@ def try_import_pyspark_in_streamlit(): # Embeddings from nlu.components.embeddings.albert.spark_nlp_albert import SparkNLPAlbert from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence +from nlu.components.embeddings.doc2vec.doc2vec import Doc2Vec from nlu.components.embeddings.bert.spark_nlp_bert import SparkNLPBert from nlu.components.embeddings.elmo.spark_nlp_elmo import SparkNLPElmo @@ -109,6 +111,8 @@ def try_import_pyspark_in_streamlit(): from nlu.components.classifiers.sentiment_dl.sentiment_dl import SentimentDl from nlu.components.classifiers.vivekn_sentiment.vivekn_sentiment_detector import ViveknSentiment from nlu.components.classifiers.pos.part_of_speech_jsl import PartOfSpeechJsl +from nlu.components.classifiers.seq_bert.seq_bert_classifier import SeqBertClassifier +from nlu.components.classifiers.seq_distilbert.seq_distilbert_classifier import SeqDilstilBertClassifier # matchers from nlu.components.matchers.date_matcher.date_matcher import DateMatcher @@ -182,6 +186,76 @@ def try_import_pyspark_in_streamlit(): import os +def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool = False, gpu: bool = False, + streamlit_caching: bool = False) -> NLUPipeline: + ''' + Load either a prebuild pipeline or a set of 
components identified by a whitespace-separated list of components.
+    You must call nlu.auth() BEFORE calling nlu.load() to access licensed models.
+    If you did not call nlu.auth() but did call nlu.load(), you must RESTART your Python process and call nlu.auth().
+    You cannot authorize once nlu.load() has been called, because of the Spark Context.
+    :param verbose: Whether to print verbose process information
+    :param path: If path is not None, the model/pipe for the NLU reference will be loaded from the path. Useful for offline mode. Currently only loading entire NLU pipelines is supported, not singular pipes
+    :param request: An NLU model/pipeline/component reference
+    :param gpu: Whether to start the Spark session with GPU support
+    :param streamlit_caching: Whether to enable Streamlit caching for this load
+    :return: a non-fitted NLU pipeline object
+    '''
+    if streamlit_caching and not nlu.st_cache_enabled:
+        # enable caching once, then re-enter load() so the cached variant handles the request
+        enable_streamlit_caching()
+        return nlu.load(request, path, verbose, gpu, streamlit_caching)
+    global is_authenticated
+    is_authenticated = True
+    auth(gpu=gpu)  # check if secrets are in the default location; if yes, load them and create a licensed context automatically
+    spark = get_open_source_spark_context(gpu)
+    spark.catalog.clearCache()
+
+    # Enable PyArrow
+    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
+    spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")
+
+    if verbose:
+        enable_verbose()
+    else:
+        disable_verbose()
+
+    if path != None:
+        logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
+        pipe = PipelineQueryVerifier.check_and_fix_nlu_pipeline(load_nlu_pipe_from_hdd(path, request))
+        pipe.nlu_ref = request
+        return pipe
+    components_requested = request.split(' ')  # e.g. 'sentiment emotion yake'
+    pipe = NLUPipeline()
+    language = parse_language_from_nlu_ref(request)
+    pipe.lang = language
+    pipe.nlu_ref = request
+
+    try:
+        for nlu_ref in components_requested:
+            nlu_ref = nlu_ref.replace(' ', '')  # str.replace returns a new string, so the result must be assigned
+            if nlu_ref == '': continue
+            nlu_component = nlu_ref_to_component(nlu_ref, authenticated=is_authenticated)
+            # if we get a list of components, the NLU reference is a pipeline and we do not need to check order
+            if type(nlu_component) == type([]):
+                # lists are parsed down to multiple components
+                for c in nlu_component: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+            else:
+                pipe.add(nlu_component, nlu_ref)
+        pipe = PipelineQueryVerifier.check_and_fix_nlu_pipeline(pipe)
+        pipe.nlu_ref = request
+        for c in pipe.components:
+            if c.info.license == 'licensed': pipe.has_licensed_components = True
+        return pipe
+
+    except:
+        import sys
+        if verbose:
+            e = sys.exc_info()
+            print(e[0])
+            print(e[1])
+        raise Exception(
+            "Something went wrong during loading and fitting the pipe. Check the other prints for more information and also verbose mode. Did you use a correct model reference?")
+
+
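For readers skimming this patch, here is a minimal usage sketch of the `load`/`auth` API defined above. It is not part of the diff: the `'sentiment emotion'` reference list is taken from the inline comment in `load`, and `pipe.predict(...)` is assumed from NLU's documented pipeline API.

```python
import nlu

# Licensed (healthcare) models require authentication BEFORE the first nlu.load(),
# since the Spark Context cannot be re-authorized afterwards.
# nlu.auth('/content/spark_nlp_for_healthcare.json')

# A whitespace-separated reference list builds a single pipeline from several components.
pipe = nlu.load('sentiment emotion', verbose=True)

# Offline mode: load an entire NLU pipeline previously saved to disk.
# offline_pipe = nlu.load(path='/models/my_saved_pipeline')

df = pipe.predict('NLU makes Spark NLP easy to use.')
print(df)
```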
 def auth(SPARK_NLP_LICENSE_OR_JSON_PATH='/content/spark_nlp_for_healthcare.json', AWS_ACCESS_KEY_ID='',
          AWS_SECRET_ACCESS_KEY='', JSL_SECRET='', gpu=False):
     """ Authenticate enviroment for JSL Liscensed models.
     Installs NLP-Healthcare if not in enviroment detected
@@ -297,76 +371,6 @@ def enable_hard_offline_checks(): nlu.hard_offline_checks = True
 def disable_hard_offline_checks(): nlu.hard_offline_checks = False
-def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool = False, gpu: bool = False,
-         streamlit_caching: bool = False) -> NLUPipeline:
-    '''
-    Load either a prebuild pipeline or a set of components identified by a whitespace seperated list of components
-    You must call nlu.auth() BEFORE calling nlu.load() to access licensed models.
-    If you did not call nlu.auth() but did call nlu.load() you must RESTART your Python Process and call nlu.auth().
-    You cannot authorize once nlu.load() is called because of Spark Context.
-    :param verbose:
-    :param path: If path is not None, the model/pipe for the NLU reference will be loaded from the path. Useful for offline mode. Currently only loading entire NLU pipelines is supported, but not loading singular pipes
-    :param request: A NLU model/pipeline/component reference
-    :param version_checks: Wether to check if Pyspark is properly installed and if the Pyspark version is correct for the NLU version. If set to False, these tests will be skipped
-    :return: returns a non fitted nlu pipeline object
-    '''
-    if streamlit_caching and not nlu.st_cache_enabled:
-        enable_streamlit_caching()
-        return nlu.load(request, path, verbose, gpu, streamlit_caching)
-    global is_authenticated
-    is_authenticated = True
-    auth(gpu=gpu)  # check if secets are in default loc, if yes load them and create licensed context automatically
-    spark = get_open_source_spark_context(gpu)
-    spark.catalog.clearCache()
-
-    # Enable PyArrow
-    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
-    spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled.", "true")
-
-    if verbose:
-        enable_verbose()
-    else:
-        disable_verbose()
-
-    if path != None:
-        logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
-        pipe = PipelineQueryVerifier.check_and_fix_nlu_pipeline(load_nlu_pipe_from_hdd(path, request))
-        pipe.nlu_ref = request
-        return pipe
-    components_requested = request.split(' ')  ## sentiment emotion yake
-    pipe = NLUPipeline()
-    language = parse_language_from_nlu_ref(request)
-    pipe.lang = language
-    pipe.nlu_ref = request
-
-    try:
-        for nlu_ref in components_requested:
-            nlu_ref.replace(' ', '')
-            # component = component.lower()
-            if nlu_ref == '': continue
-            nlu_component = nlu_ref_to_component(nlu_ref, authenticated=is_authenticated)
-            # if we get a list of components, then the NLU reference is a pipeline, we do not need to check order
-            if type(nlu_component) == type([]):
-                # lists are parsed down to multiple components
-                for c in nlu_component: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
-            else:
-                pipe.add(nlu_component, nlu_ref)
-        pipe = PipelineQueryVerifier.check_and_fix_nlu_pipeline(pipe)
-        pipe.nlu_ref = request
-        for c in pipe.components:
-            if c.info.license == 'licensed': pipe.has_licensed_components = True
-        return pipe
-
-    except :
-        import sys
-        if verbose:
-            e = sys.exc_info()
-            print(e[0])
-            print(e[1])
-        raise Exception(
-            "Something went wrong during loading and fitting the pipe. Check the other prints for more information and also verbose mode.
Did you use a correct model reference?") - - class NluError: def __init__(self): self.has_trainable_components = False diff --git a/nlu/components/assertion.py b/nlu/components/assertion.py index 29f917f1..437adc6c 100644 --- a/nlu/components/assertion.py +++ b/nlu/components/assertion.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Asserter(SparkNLUComponent): def __init__(self, annotator_class='assertion_dl', lang='en', component_type='assertion', get_default=True, model = None, nlp_ref ='', nlu_ref='', trainable=False, is_licensed=False, loaded_from_pretrained_pipe=False): diff --git a/nlu/components/chunker.py b/nlu/components/chunker.py index 69fd127f..85e504ba 100644 --- a/nlu/components/chunker.py +++ b/nlu/components/chunker.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Chunker(SparkNLUComponent): diff --git a/nlu/components/classifier.py b/nlu/components/classifier.py index e5a7d125..7e4f7b9c 100644 --- a/nlu/components/classifier.py +++ b/nlu/components/classifier.py @@ -1,141 +1,233 @@ -from nlu.pipe.pipe_components import SparkNLUComponent -class Classifier(SparkNLUComponent): - def __init__(self, annotator_class='sentiment_dl', language='en', component_type='classifier', get_default=True, model = None, nlp_ref ='', nlu_ref='',trainable=False, is_licensed=False, do_ref_checks=True,loaded_from_pretrained_pipe=False): - if do_ref_checks: - if 'e2e' in nlu_ref or 'toxic' in nlu_ref : annotator_class= 'multi_classifier' - elif 'e2e' in nlp_ref or 'toxic' in nlp_ref : annotator_class= 'multi_classifier' - - elif 'token_bert' in nlp_ref or 'token_bert' in nlu_ref : annotator_class = 'token_bert' - elif 'token_distilbert' in nlp_ref or 'token_distilbert' in nlu_ref : annotator_class = 'token_distilbert' - elif 'token_distilroberta' in nlp_ref or 'token_distilroberta' in nlu_ref : annotator_class = 'token_roberta' - elif 'token_xlm_roberta' in nlp_ref or 'token_xlm_roberta' in nlu_ref : annotator_class = 'token_xlm_roberta' - elif 'token_roberta' in nlp_ref or 'token_roberta' in nlu_ref : annotator_class = 'token_roberta' - elif 'token_albert' in nlp_ref or 'token_albert' in nlu_ref : annotator_class = 'token_albert' - elif 'token_xlnet' in nlp_ref or 'token_xlnet' in nlu_ref : annotator_class = 'token_xlnet' - elif 'token_longformer' in nlp_ref or 'token_longformer' in nlu_ref : annotator_class = 'token_longformer' - - - elif 'multiclassifierdl' in nlp_ref : annotator_class= 'multi_classifier' - elif 'classifierdl' in nlp_ref: annotator_class= 'classifier_dl' +from nlu.pipe.pipe_component import SparkNLUComponent - elif 'yake' in nlu_ref: annotator_class= 'yake' - elif 'yake' in nlp_ref: annotator_class= 'yake' - elif 'sentimentdl' in nlp_ref : annotator_class= 'sentiment_dl' +class Classifier(SparkNLUComponent): + def __init__(self, annotator_class='sentiment_dl', language='en', component_type='classifier', get_default=True, + model=None, nlp_ref='', nlu_ref='', trainable=False, is_licensed=False, do_ref_checks=True, + loaded_from_pretrained_pipe=False): + if do_ref_checks: + if 'e2e' in nlu_ref or 'toxic' in nlu_ref: + annotator_class = 'multi_classifier' + elif 'e2e' in nlp_ref or 'toxic' in nlp_ref: + annotator_class = 'multi_classifier' + elif 'distilbert_sequence' in nlp_ref or 'distilbert_sequence' in nlu_ref: + annotator_class = 'seq_distilbert' + elif 'bert_sequence' in nlp_ref or 'bert_sequence' in 
nlu_ref: + annotator_class = 'seq_bert' + elif 'token_bert' in nlp_ref or 'token_bert' in nlu_ref: + annotator_class = 'token_bert' + elif 'token_distilbert' in nlp_ref or 'token_distilbert' in nlu_ref: + annotator_class = 'token_distilbert' + elif 'token_distilroberta' in nlp_ref or 'token_distilroberta' in nlu_ref: + annotator_class = 'token_roberta' + elif 'token_xlm_roberta' in nlp_ref or 'token_xlm_roberta' in nlu_ref: + annotator_class = 'token_xlm_roberta' + elif 'token_roberta' in nlp_ref or 'token_roberta' in nlu_ref: + annotator_class = 'token_roberta' + elif 'token_albert' in nlp_ref or 'token_albert' in nlu_ref: + annotator_class = 'token_albert' + elif 'token_xlnet' in nlp_ref or 'token_xlnet' in nlu_ref: + annotator_class = 'token_xlnet' + elif 'token_longformer' in nlp_ref or 'token_longformer' in nlu_ref: + annotator_class = 'token_longformer' + elif 'multiclassifierdl' in nlp_ref: + annotator_class = 'multi_classifier' + elif 'classifierdl' in nlp_ref: + annotator_class = 'classifier_dl' + elif 'yake' in nlu_ref: + annotator_class = 'yake' + elif 'yake' in nlp_ref: + annotator_class = 'yake' + elif 'sentimentdl' in nlp_ref: + annotator_class = 'sentiment_dl' - elif 'vivekn' in nlp_ref or 'vivekn' in nlp_ref : annotator_class= 'vivekn_sentiment' + elif 'vivekn' in nlp_ref or 'vivekn' in nlp_ref: + annotator_class = 'vivekn_sentiment' - elif 'wiki_' in nlu_ref or 'wiki_' in nlp_ref : annotator_class= 'language_detector' - elif 'pos' in nlu_ref and 'ner' not in nlu_ref: annotator_class= 'pos' - elif 'pos' in nlp_ref and 'ner' not in nlp_ref: annotator_class= 'pos' + elif 'wiki_' in nlu_ref or 'wiki_' in nlp_ref: + annotator_class = 'language_detector' + elif 'pos' in nlu_ref and 'ner' not in nlu_ref: + annotator_class = 'pos' + elif 'pos' in nlp_ref and 'ner' not in nlp_ref: + annotator_class = 'pos' - elif 'icd' in nlu_ref and 'med_ner' not in nlu_ref: annotator_class= 'classifier_dl' - elif 'med_ner' in nlu_ref: annotator_class= 'ner_healthcare' - elif 'generic_classifier' in nlu_ref: annotator_class= 'generic_classifier' - elif 'ner' in nlu_ref and 'generic' not in nlu_ref : annotator_class= 'ner' - elif 'ner' in nlp_ref and 'generic' not in nlp_ref : annotator_class= 'ner' + elif 'icd' in nlu_ref and 'med_ner' not in nlu_ref: + annotator_class = 'classifier_dl' + elif 'med_ner' in nlu_ref: + annotator_class = 'ner_healthcare' + elif 'generic_classifier' in nlu_ref: + annotator_class = 'generic_classifier' + elif 'ner' in nlu_ref and 'generic' not in nlu_ref: + annotator_class = 'ner' + elif 'ner' in nlp_ref and 'generic' not in nlp_ref: + annotator_class = 'ner' - if model != None : + if model != None: self.model = model - from sparknlp.annotator import NerDLModel,NerCrfModel - if isinstance(self.model, (NerDLModel,NerCrfModel)): self.model.setIncludeConfidence(True) - elif is_licensed : + from sparknlp.annotator import NerDLModel, NerCrfModel + if isinstance(self.model, (NerDLModel, NerCrfModel)): + self.model.setIncludeConfidence(True) + elif is_licensed: from sparknlp_jsl.annotator import MedicalNerModel if isinstance(self.model, MedicalNerModel): self.model.setIncludeConfidence(True) - - else : - if 'sentiment' in annotator_class and 'vivekn' not in annotator_class: + else: + if 'seq_distilbert' == annotator_class: + from nlu import SeqDilstilBertClassifier + if get_default: + self.model = SeqDilstilBertClassifier.get_default_model() + elif is_licensed: + self.model = SeqDilstilBertClassifier.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + 
self.model = SeqDilstilBertClassifier.get_pretrained_model(nlp_ref, language) + elif 'seq_bert' == annotator_class: + from nlu import SeqBertClassifier + if get_default: + self.model = SeqBertClassifier.get_default_model() + elif is_licensed: + self.model = SeqBertClassifier.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = SeqBertClassifier.get_pretrained_model(nlp_ref, language) + elif 'sentiment' in annotator_class and 'vivekn' not in annotator_class: from nlu import SentimentDl - if trainable : self.model = SentimentDl.get_default_trainable_model() - elif is_licensed : self.model = SentimentDl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - elif get_default : self.model = SentimentDl.get_default_model() - else : self.model = SentimentDl.get_pretrained_model(nlp_ref, language) - elif 'token_distilbert' == annotator_class : + if trainable: + self.model = SentimentDl.get_default_trainable_model() + elif is_licensed: + self.model = SentimentDl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') + elif get_default: + self.model = SentimentDl.get_default_model() + else: + self.model = SentimentDl.get_pretrained_model(nlp_ref, language) + elif 'token_distilbert' == annotator_class: from nlu import TokenDistilBert - if get_default: self.model = TokenDistilBert.get_default_model() - elif is_licensed : self.model = TokenDistilBert.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenDistilBert.get_pretrained_model(nlp_ref, language) - elif 'token_bert' == annotator_class : + if get_default: + self.model = TokenDistilBert.get_default_model() + elif is_licensed: + self.model = TokenDistilBert.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenDistilBert.get_pretrained_model(nlp_ref, language) + elif 'token_bert' == annotator_class: from nlu import TokenBert - if get_default: self.model = TokenBert.get_default_model() - elif is_licensed : self.model = TokenBert.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenBert.get_pretrained_model(nlp_ref, language) - elif 'token_xlm_roberta' == annotator_class : + if get_default: + self.model = TokenBert.get_default_model() + elif is_licensed: + self.model = TokenBert.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenBert.get_pretrained_model(nlp_ref, language) + elif 'token_xlm_roberta' == annotator_class: from nlu import TokenXlmRoBerta - if get_default: self.model = TokenXlmRoBerta.get_default_model() - elif is_licensed : self.model = TokenXlmRoBerta.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenXlmRoBerta.get_pretrained_model(nlp_ref, language) - elif 'token_roberta' == annotator_class : + if get_default: + self.model = TokenXlmRoBerta.get_default_model() + elif is_licensed: + self.model = TokenXlmRoBerta.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenXlmRoBerta.get_pretrained_model(nlp_ref, language) + elif 'token_roberta' == annotator_class: from nlu import TokenRoBerta - if get_default: self.model = TokenRoBerta.get_default_model() - elif is_licensed : self.model = TokenRoBerta.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenRoBerta.get_pretrained_model(nlp_ref, language) - elif 'token_albert' == annotator_class : + if get_default: + self.model = TokenRoBerta.get_default_model() + elif is_licensed: + self.model = 
TokenRoBerta.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenRoBerta.get_pretrained_model(nlp_ref, language) + elif 'token_albert' == annotator_class: from nlu import TokenAlbert - if get_default: self.model = TokenAlbert.get_default_model() - elif is_licensed : self.model = TokenAlbert.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenAlbert.get_pretrained_model(nlp_ref, language) - elif 'token_longformer' == annotator_class : + if get_default: + self.model = TokenAlbert.get_default_model() + elif is_licensed: + self.model = TokenAlbert.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenAlbert.get_pretrained_model(nlp_ref, language) + elif 'token_longformer' == annotator_class: from nlu import TokenLongFormer - if get_default: self.model = TokenLongFormer.get_default_model() - elif is_licensed : self.model = TokenLongFormer.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenLongFormer.get_pretrained_model(nlp_ref, language) - elif 'token_xlnet' == annotator_class : + if get_default: + self.model = TokenLongFormer.get_default_model() + elif is_licensed: + self.model = TokenLongFormer.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenLongFormer.get_pretrained_model(nlp_ref, language) + elif 'token_xlnet' == annotator_class: from nlu import TokenXlnet - if get_default: self.model = TokenXlnet.get_default_model() - elif is_licensed : self.model = TokenXlnet.get_pretrained_model(nlp_ref, language,'clinical/models') - else : self.model = TokenXlnet.get_pretrained_model(nlp_ref, language) + if get_default: + self.model = TokenXlnet.get_default_model() + elif is_licensed: + self.model = TokenXlnet.get_pretrained_model(nlp_ref, language, 'clinical/models') + else: + self.model = TokenXlnet.get_pretrained_model(nlp_ref, language) elif 'generic_classifier' in annotator_class: from nlu.components.classifiers.generic_classifier.generic_classifier import GenericClassifier - if trainable : self.model = GenericClassifier.get_default_trainable_model() - else : self.model = GenericClassifier.get_pretrained_model(nlp_ref, language, bucket='clinical/models') + if trainable: + self.model = GenericClassifier.get_default_trainable_model() + else: + self.model = GenericClassifier.get_pretrained_model(nlp_ref, language, bucket='clinical/models') elif 'vivekn' in annotator_class: from nlu import ViveknSentiment - if get_default : self.model = ViveknSentiment.get_default_model() - else : self.model = ViveknSentiment.get_pretrained_model(nlp_ref, language) - elif 'ner' in annotator_class and 'ner_healthcare' not in annotator_class: + if get_default: + self.model = ViveknSentiment.get_default_model() + else: + self.model = ViveknSentiment.get_pretrained_model(nlp_ref, language) + elif 'ner' in annotator_class and 'ner_healthcare' not in annotator_class: from nlu import NERDL - if trainable : self.model = NERDL.get_default_trainable_model() - elif is_licensed : self.model = NERDL.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - elif get_default : self.model = NERDL.get_default_model() - else : self.model = NERDL.get_pretrained_model(nlp_ref, language) + if trainable: + self.model = NERDL.get_default_trainable_model() + elif is_licensed: + self.model = NERDL.get_pretrained_model(nlp_ref, language, bucket='clinical/models') + elif get_default: + self.model = NERDL.get_default_model() + else: + self.model = 
NERDL.get_pretrained_model(nlp_ref, language) if hasattr(self, 'model'): self.model.setIncludeConfidence(True) elif 'ner.crf' in annotator_class: from nlu import NERDLCRF - if get_default : self.model = NERDLCRF.get_default_model() - else : self.model = NERDLCRF.get_pretrained_model(nlp_ref, language) + if get_default: + self.model = NERDLCRF.get_default_model() + else: + self.model = NERDLCRF.get_pretrained_model(nlp_ref, language) if hasattr(self, 'model'): self.model.setIncludeConfidence(True) elif ('classifier_dl' in annotator_class or annotator_class == 'toxic') and not 'multi' in annotator_class: from nlu import ClassifierDl - if trainable: self.model = ClassifierDl.get_trainable_model() - elif is_licensed : self.model = ClassifierDl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - elif get_default : self.model = ClassifierDl.get_default_model() - else : self.model = ClassifierDl.get_pretrained_model(nlp_ref, language) - if hasattr(self.model,'setIncludeConfidence'): self.model.setIncludeConfidence(True) + if trainable: + self.model = ClassifierDl.get_trainable_model() + elif is_licensed: + self.model = ClassifierDl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') + elif get_default: + self.model = ClassifierDl.get_default_model() + else: + self.model = ClassifierDl.get_pretrained_model(nlp_ref, language) + if hasattr(self.model, 'setIncludeConfidence'): self.model.setIncludeConfidence(True) elif 'language_detector' in annotator_class: from nlu import LanguageDetector - if get_default : self.model = LanguageDetector.get_default_model() - else: self.model = LanguageDetector.get_pretrained_model(nlp_ref, language) + if get_default: + self.model = LanguageDetector.get_default_model() + else: + self.model = LanguageDetector.get_pretrained_model(nlp_ref, language) elif 'pos' in annotator_class: from nlu import PartOfSpeechJsl - if trainable : self.model = PartOfSpeechJsl.get_default_trainable_model() - elif get_default : self.model = PartOfSpeechJsl.get_default_model() - elif is_licensed : self.model = PartOfSpeechJsl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - else : self.model = PartOfSpeechJsl.get_pretrained_model(nlp_ref, language) + if trainable: + self.model = PartOfSpeechJsl.get_default_trainable_model() + elif get_default: + self.model = PartOfSpeechJsl.get_default_model() + elif is_licensed: + self.model = PartOfSpeechJsl.get_pretrained_model(nlp_ref, language, bucket='clinical/models') + else: + self.model = PartOfSpeechJsl.get_pretrained_model(nlp_ref, language) elif 'yake' in annotator_class: from nlu import Yake - self.model = Yake.get_default_model() - elif 'multi_classifier' in annotator_class : + self.model = Yake.get_default_model() + elif 'multi_classifier' in annotator_class: from nlu import MultiClassifier - if trainable : self.model = MultiClassifier.get_default_trainable_model() - elif get_default : self.model = MultiClassifier.get_default_model() - else : self.model = MultiClassifier.get_pretrained_model(nlp_ref, language) + if trainable: + self.model = MultiClassifier.get_default_trainable_model() + elif get_default: + self.model = MultiClassifier.get_default_model() + else: + self.model = MultiClassifier.get_pretrained_model(nlp_ref, language) elif 'ner_healthcare' in annotator_class: from nlu.components.classifiers.ner_healthcare.ner_dl_healthcare import NERDLHealthcare - if trainable : self.model = NERDLHealthcare.get_default_trainable_model() - else : self.model = 
NERDLHealthcare.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - - + if trainable: + self.model = NERDLHealthcare.get_default_trainable_model() + else: + self.model = NERDLHealthcare.get_pretrained_model(nlp_ref, language, bucket='clinical/models') - SparkNLUComponent.__init__(self, annotator_class, component_type, nlu_ref, nlp_ref, language,loaded_from_pretrained_pipe , is_licensed) + SparkNLUComponent.__init__(self, annotator_class, component_type, nlu_ref, nlp_ref, language, + loaded_from_pretrained_pipe, is_licensed) diff --git a/nlu/components/classifiers/seq_bert/__init__.py b/nlu/components/classifiers/seq_bert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/classifiers/seq_bert/component_infos.json b/nlu/components/classifiers/seq_bert/component_infos.json new file mode 100644 index 00000000..4c92d921 --- /dev/null +++ b/nlu/components/classifiers/seq_bert/component_infos.json @@ -0,0 +1,25 @@ +{ + "name": "seq_bert", + "output_level": "sentence", + "description": "todo", + "trainable": false, + "outputs": [ + "category" + ], + "inputs": [ + "token", + "sentence" + ], + "type": "classifier", + "spark_input_column_names": [ + "token", + "sentence" + ], + "spark_output_column_names": [ + "category" + ], + "provider": "sparknlp", + "license": "open source", + "computation_context": "spark", + "output_context": "spark" +} \ No newline at end of file diff --git a/nlu/components/classifiers/seq_bert/seq_bert_classifier.py b/nlu/components/classifiers/seq_bert/seq_bert_classifier.py new file mode 100644 index 00000000..8dffd7d6 --- /dev/null +++ b/nlu/components/classifiers/seq_bert/seq_bert_classifier.py @@ -0,0 +1,20 @@ +from sparknlp.annotator import * +class SeqBertClassifier: + @staticmethod + def get_default_model(): + return BertForSequenceClassification.pretrained() \ + .setInputCols(["token", "sentence"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return BertForSequenceClassification.pretrained(name, language, bucket) \ + .setInputCols(["token", "sentence"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + + + + diff --git a/nlu/components/classifiers/seq_distilbert/__init__.py b/nlu/components/classifiers/seq_distilbert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/classifiers/seq_distilbert/component_infos.json b/nlu/components/classifiers/seq_distilbert/component_infos.json new file mode 100644 index 00000000..fde73722 --- /dev/null +++ b/nlu/components/classifiers/seq_distilbert/component_infos.json @@ -0,0 +1,25 @@ +{ + "name": "seq_distilbert", + "output_level": "sentence", + "description": "todo", + "trainable": false, + "outputs": [ + "category" + ], + "inputs": [ + "token", + "sentence" + ], + "type": "classifier", + "spark_input_column_names": [ + "token", + "sentence" + ], + "spark_output_column_names": [ + "category" + ], + "provider": "sparknlp", + "license": "open source", + "computation_context": "spark", + "output_context": "spark" +} \ No newline at end of file diff --git a/nlu/components/classifiers/seq_distilbert/seq_distilbert_classifier.py b/nlu/components/classifiers/seq_distilbert/seq_distilbert_classifier.py new file mode 100644 index 00000000..a2f232f9 --- /dev/null +++ b/nlu/components/classifiers/seq_distilbert/seq_distilbert_classifier.py @@ -0,0 +1,21 @@ +from sparknlp.annotator import * +class SeqDilstilBertClassifier: + @staticmethod + def 
get_default_model(): + return DistilBertForSequenceClassification.pretrained() \ + .setInputCols(["token", "sentence"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + DistilBertForSequenceClassification.name = "DistilBertForSequenceClassification" + return DistilBertForSequenceClassification.pretrained(name, language, bucket) \ + .setInputCols(["token", "sentence"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + + + + diff --git a/nlu/components/deidentification.py b/nlu/components/deidentification.py index dac2fb01..e0eca8bd 100644 --- a/nlu/components/deidentification.py +++ b/nlu/components/deidentification.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Deidentification(SparkNLUComponent): def __init__(self, annotator_class='deidentifier', lang='en', component_type='deidentifier', get_default=False, model = None, nlp_ref ='', nlu_ref='', trainable=False, is_licensed=True,loaded_from_pretrained_pipe=False): annotator_class= 'deidentifier' diff --git a/nlu/components/embedding.py b/nlu/components/embedding.py index 76384ddc..066a5771 100644 --- a/nlu/components/embedding.py +++ b/nlu/components/embedding.py @@ -1,13 +1,13 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Embeddings(SparkNLUComponent): - def __init__(self, annotator_class='glove', lang ='en', component_type='embedding', get_default=True, model = None, nlp_ref ='', nlu_ref ='', is_licensed=False, resolution_ref='',loaded_from_pretrained_pipe=False,do_ref_checks=True ): if do_ref_checks: if 'use' in nlu_ref and 'bert' not in nlu_ref or 'tfhub_use' in nlp_ref and 'bert' not in nlp_ref: annotator_class = 'use' # first check for sentence then token embeddings. 
elif 'longformer' in nlu_ref : annotator_class = 'longformer' + elif 'doc2vec' in nlu_ref : annotator_class = 'doc2vec' elif 'sent' in nlu_ref and 'xlm_roberta' in nlu_ref : annotator_class = 'sentence_xlm' elif 'xlm' in nlu_ref or 'xlm' in nlp_ref : annotator_class = 'xlm' @@ -53,6 +53,10 @@ def __init__(self, annotator_class='glove', lang ='en', component_type='embeddin from nlu import Sentence_XLM if get_default: self.model = Sentence_XLM.get_default_model() else : self.model = Sentence_XLM.get_pretrained_model(nlp_ref, lang) + elif 'doc2vec' == annotator_class : + from nlu import Doc2Vec + if get_default: self.model = Doc2Vec.get_default_model() + else : self.model = Doc2Vec.get_pretrained_model(nlp_ref, lang) elif 'longformer' == annotator_class : from nlu import Longformer if get_default: self.model = Longformer.get_default_model() diff --git a/nlu/components/embeddings/doc2vec/__init__.py b/nlu/components/embeddings/doc2vec/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/embeddings/doc2vec/component_infos.json b/nlu/components/embeddings/doc2vec/component_infos.json new file mode 100644 index 00000000..80393898 --- /dev/null +++ b/nlu/components/embeddings/doc2vec/component_infos.json @@ -0,0 +1,23 @@ +{ + "name": "doc2vec", + "description": "todo", + "output_level": "input_dependent", + "trainable": false, + "outputs": [ + "sentence_embeddings" + ], + "inputs": [ + "token" + ], + "type": "sentence_embeddings", + "spark_input_column_names": [ + "token" + ], + "spark_output_column_names": [ + "sentence_embeddings" + ], + "provider": "sparknlp", + "license": "open source", + "computation_context": "spark", + "output_context": "spark" +} \ No newline at end of file diff --git a/nlu/components/embeddings/doc2vec/doc2vec.py b/nlu/components/embeddings/doc2vec/doc2vec.py new file mode 100644 index 00000000..c9e3f01f --- /dev/null +++ b/nlu/components/embeddings/doc2vec/doc2vec.py @@ -0,0 +1,22 @@ +from sparknlp.annotator import * + + +class Doc2Vec: + @staticmethod + def get_default_model(): + return Doc2VecModel.pretrained() \ + .setInputCols("token") \ + .setOutputCol("sentence_embeddings") + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return Doc2VecModel.pretrained(name,language,bucket) \ + .setInputCols("token") \ + .setOutputCol("sentence_embeddings") + @staticmethod + def get_trainable_model(): + return Doc2VecApproach()\ + .setInputCols("token") \ + .setOutputCol("sentence_embeddings") + + diff --git a/nlu/components/embeddings_chunker.py b/nlu/components/embeddings_chunker.py index bc55696e..cfc060ce 100644 --- a/nlu/components/embeddings_chunker.py +++ b/nlu/components/embeddings_chunker.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class EmbeddingsChunker(SparkNLUComponent): diff --git a/nlu/components/labeled_dependency_parser.py b/nlu/components/labeled_dependency_parser.py index 9d7ff2d7..17a47f00 100644 --- a/nlu/components/labeled_dependency_parser.py +++ b/nlu/components/labeled_dependency_parser.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class LabeledDependencyParser(SparkNLUComponent): def __init__(self, annotator_class='labeled_dependency_parser', language ='en', component_type='dependency_typed', get_default=True, nlp_ref='', nlu_ref='', model=None,loaded_from_pretrained_pipe=False,is_licensed=False): diff --git 
a/nlu/components/lemmatizer.py b/nlu/components/lemmatizer.py
index 9b112e0c..af74550a 100644
--- a/nlu/components/lemmatizer.py
+++ b/nlu/components/lemmatizer.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent


 class Lemmatizer(SparkNLUComponent):
diff --git a/nlu/components/matcher.py b/nlu/components/matcher.py
index 6d8f677b..06bae5ae 100644
--- a/nlu/components/matcher.py
+++ b/nlu/components/matcher.py
@@ -1,5 +1,5 @@
 import nlu
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent


 class Matcher(SparkNLUComponent):
diff --git a/nlu/components/nlu_tokenizer.py b/nlu/components/nlu_tokenizer.py
index 4da2dd5a..137ca4b9 100644
--- a/nlu/components/nlu_tokenizer.py
+++ b/nlu/components/nlu_tokenizer.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent
 import nlu
diff --git a/nlu/components/normalizer.py b/nlu/components/normalizer.py
index f183001c..00a0d23d 100644
--- a/nlu/components/normalizer.py
+++ b/nlu/components/normalizer.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent
 class Normalizer(SparkNLUComponent):
     def __init__(self, annotator_class='normalizer', language='en', component_type='normalizer', get_default=True, nlp_ref='',nlu_ref='',model=None, is_licensed=False, loaded_from_pretrained_pipe=False):
diff --git a/nlu/components/relation.py b/nlu/components/relation.py
index 48bc6851..feab3545 100644
--- a/nlu/components/relation.py
+++ b/nlu/components/relation.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent
 class Relation(SparkNLUComponent):
     def __init__(self, annotator_class='relation_extractor', lang='en', component_type='relation_extractor', get_default=True, model = None, nlp_ref ='', nlu_ref='', trainable=False, is_licensed=False,loaded_from_pretrained_pipe=False):
diff --git a/nlu/components/resolution.py b/nlu/components/resolution.py
index a3a436f5..b1133f38 100644
--- a/nlu/components/resolution.py
+++ b/nlu/components/resolution.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent
 class Resolver(SparkNLUComponent):
     def __init__(self, annotator_class='sentence_entity_resolver', language='en', component_type='resolution', get_default=True, model = None, nlp_ref ='', nlu_ref='',trainable=False, is_licensed=True, loaded_from_pretrained_pipe=False):
diff --git a/nlu/components/sentence_detector.py b/nlu/components/sentence_detector.py
index e36477f8..78e7581c 100644
--- a/nlu/components/sentence_detector.py
+++ b/nlu/components/sentence_detector.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent
 class NLUSentenceDetector(SparkNLUComponent):
     def __init__(self, annotator_class='sentence_detector', language='en', component_type='sentence_detector', get_default=True, model = None, nlp_ref='', nlu_ref='', trainable=False, is_licensed=False,lang='en',loaded_from_pretrained_pipe=False):
diff --git a/nlu/components/sequence2sequence.py b/nlu/components/sequence2sequence.py
index 7d20e24a..344d99a9 100644
--- a/nlu/components/sequence2sequence.py
+++ b/nlu/components/sequence2sequence.py
@@ -1,4 +1,4 @@
-from nlu.pipe.pipe_components import SparkNLUComponent
+from nlu.pipe.pipe_component import SparkNLUComponent class Seq2Seq(SparkNLUComponent): diff --git a/nlu/components/spell_checker.py b/nlu/components/spell_checker.py index f49ecfbd..b0aadae9 100644 --- a/nlu/components/spell_checker.py +++ b/nlu/components/spell_checker.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class SpellChecker(SparkNLUComponent): def __init__(self, annotator_class='context_spell', language ='en', component_type='spell_checker', get_default=True, model = None, nlp_ref='', dataset='', nlu_ref ='', is_licensed=False, loaded_from_pretrained_pipe=True): diff --git a/nlu/components/stemmer.py b/nlu/components/stemmer.py index f90f1abf..fe197de2 100644 --- a/nlu/components/stemmer.py +++ b/nlu/components/stemmer.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Stemmer(SparkNLUComponent): diff --git a/nlu/components/stopwordscleaner.py b/nlu/components/stopwordscleaner.py index b9036d4f..ab14de33 100644 --- a/nlu/components/stopwordscleaner.py +++ b/nlu/components/stopwordscleaner.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class StopWordsCleaner(SparkNLUComponent): diff --git a/nlu/components/unlabeled_dependency_parser.py b/nlu/components/unlabeled_dependency_parser.py index f65b185b..1ce2596c 100644 --- a/nlu/components/unlabeled_dependency_parser.py +++ b/nlu/components/unlabeled_dependency_parser.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class UnlabeledDependencyParser(SparkNLUComponent): diff --git a/nlu/components/util.py b/nlu/components/util.py index 152720b8..bb5de295 100644 --- a/nlu/components/util.py +++ b/nlu/components/util.py @@ -1,4 +1,4 @@ -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent class Util(SparkNLUComponent): diff --git a/nlu/ocr_components/__init__.py b/nlu/ocr_components/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/pipe/col_substitution/col_substitution_OS.py b/nlu/pipe/col_substitution/col_substitution_OS.py index 143b1939..20b707a0 100644 --- a/nlu/pipe/col_substitution/col_substitution_OS.py +++ b/nlu/pipe/col_substitution/col_substitution_OS.py @@ -147,6 +147,27 @@ def substitute_transformer_token_classifier_cols(c, cols, is_unique=True): # new_cols[col]= f"{new_base_name}_confidence" return new_cols +def substitute_seq_bert_classifier_cols(c, cols, is_unique=True): + """ + Token classifier + """ + new_cols = {} + new_base_name = 'classified_token'# if is_unique else f'document_{nlu_identifier}' + for col in cols : + if '_results' in col : new_cols[col] = new_base_name + elif '_beginnings' in col : new_cols[col] = f'{new_base_name}_begin' + elif '_endings' in col : new_cols[col] = f'{new_base_name}_end' + elif '_embeddings' in col and 'Some' not in col : continue # Token never stores Embeddings new_cols[col] = f'{new_base_name}_embedding' + elif '_types' in col : continue # new_cols[col] = f'{new_base_name}_type' + elif 'meta' in col: + if '_sentence' in col : new_cols[col] = f'{new_base_name}_origin_sentence' # maps to which sentence token comes from + if 'Some' in col : new_cols[col] = f"'{new_base_name}_{col.split('Some(')[-1].split(')')[0]}_confidence" # maps to which sentence token comes from + if 'meta_' 
in col : new_cols[col] = f"'{new_base_name}_{col.split('meta_')[-1]}_confidence" # maps to which sentence token comes from + else : logger.info(f'Dropping unmatched metadata_col={col} for c={c}') + # new_cols[col]= f"{new_base_name}_confidence" + return new_cols + + def substitute_word_embed_cols(c, cols, nlu_identifier=True): """ diff --git a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py index b0f4e476..e5ae5e22 100644 --- a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py +++ b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py @@ -50,7 +50,8 @@ DistilBertForTokenClassification, BertForTokenClassification, LongformerEmbeddings, - + DistilBertForSequenceClassification, + BertForSequenceClassification, # approaches ViveknSentimentApproach , SentimentDLApproach , diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py index 6047a01a..069c8f25 100644 --- a/nlu/pipe/col_substitution/substitution_map_OS.py +++ b/nlu/pipe/col_substitution/substitution_map_OS.py @@ -11,7 +11,7 @@ """ from sparknlp.annotator import * from sparknlp.base import * -from nlu.pipe.extractors.extractor_configs_open_source import * +from nlu.pipe.extractors.extractor_configs_OS import * from nlu.pipe.col_substitution.col_substitution_OS import * OS_anno2substitution_fn = { @@ -51,6 +51,10 @@ 'default': substitute_sent_embed_cols, }, + Doc2VecModel: { + 'default': substitute_sent_embed_cols, + }, + XlmRoBertaSentenceEmbeddings: { 'default': substitute_sent_embed_cols, }, @@ -146,7 +150,6 @@ 'default': substitute_stopwords_cols, }, - BertForTokenClassification: { 'default': substitute_transformer_token_classifier_cols, }, @@ -159,12 +162,10 @@ 'default': substitute_transformer_token_classifier_cols, }, - XlmRoBertaForTokenClassification: { 'default': substitute_transformer_token_classifier_cols, }, - RoBertaForTokenClassification: { 'default': substitute_transformer_token_classifier_cols, }, @@ -173,13 +174,16 @@ 'default': substitute_transformer_token_classifier_cols, }, - AlbertForTokenClassification: { 'default': substitute_transformer_token_classifier_cols, }, - - + BertForSequenceClassification: { + 'default': substitute_seq_bert_classifier_cols + }, + DistilBertForSequenceClassification: { + 'default': substitute_seq_bert_classifier_cols + }, TextMatcherModel: { 'default': substitute_text_match_cols, }, @@ -196,9 +200,9 @@ Doc2Chunk: { 'default': substitute_doc2chunk_cols, }, -# + # Chunk2Doc: { - 'default': substitute_doc2chunk_cols, # TODO better? + 'default': substitute_doc2chunk_cols, # TODO better? 
}, T5Transformer: { 'default': substitute_T5_cols, @@ -213,6 +217,7 @@ 'default': substitute_word_seg_cols, }, + # approaches ViveknSentimentApproach: {'default': substitute_sentiment_vivk_approach_cols, 'default_full': default_full_config, }, @@ -222,5 +227,6 @@ 'default_full': default_full_config, }, NerDLApproach: {'default': substitute_ner_dl_approach_cols, 'default_full': default_full_config, }, PerceptronApproach: {'default': substitute_pos_approach_cols, 'default_full': default_full_config, }, + Doc2VecApproach: {'default': substitute_sent_embed_cols}, } diff --git a/nlu/pipe/component_resolution.py b/nlu/pipe/component_resolution.py index 6da3c72d..a85e2726 100644 --- a/nlu/pipe/component_resolution.py +++ b/nlu/pipe/component_resolution.py @@ -2,28 +2,10 @@ Contains methods used to resolve a NLU reference to a NLU component. Handler for getting default components, etcc. ''' -# <<>> -# 1. parse NAMLE data and RETURN IT -> Detect if OC or Closed source (CS) -# 2. based on wether OC or CS, use according component resolver -# 2.1 if CS_annotator, then verify licsence/ authenticate, if not do the usual (Make sure all CS imports are in seperate files) -# 3. Put all components in NLU pipe and return it -# - - -# <<>> -# 1, transform DF -# 2. Integrate outoputlevel of new annotators by getting some attriubte/str name from them. -# We cannot do isInstance() because we cannot import classes for the cmparioson -# Thus, for OUtput_level inference of singular components and the entiure pipe -# we must first check OC compoments vanilla style and if that fails we must do special infer_CS_component_level() all -# This call must infer the output level without type checks, i.e. use component infos or some string map or some trick (( !! component info!!!) - -# 2. Squeeze in 9 Annotators in old extraction process, most annotators are like old ones -# + import nlu.utils.environment.authentication as auth_utils import nlu.utils.environment.offline_load_utils as offline_utils from pyspark.ml import PipelineModel -from sparknlp.base import DocumentAssembler from sparknlp.annotator import * import nlu from nlu import logger, Util, Embeddings, Classifier, Spellbook, ClassifierDl, NLUSentenceDetector, NGram, Seq2Seq, \ @@ -34,27 +16,15 @@ from nlu.pipe.utils.pipe_utils import PipeUtils from nlu.pipe.utils.component_utils import ComponentUtils from sparknlp.base import * +from typing import Union +from nlu.pipe.utils.resolution.storage_ref_resolution_utils import * +from nlu.pipe.pipe_component import NLUComponent -def load_offline_model(path): - c = offline_utils.verify_and_create_model(path) - return c - - -def parse_language_from_nlu_ref(nlu_ref): - """Parse a ISO language identifier from a NLU reference which can be used to load a Spark NLP model""" - infos = nlu_ref.split('.') - for split in infos: - if split in nlu.AllComponentsInfo().all_languages: - logger.info(f'Parsed Nlu_ref={nlu_ref} as lang={split}') - return split - logger.info(f'Parsed Nlu_ref={nlu_ref} as lang=en') - return 'en' - - -def get_default_component_of_type(missing_component_type, language='en', is_licensed=False, is_trainable_pipe=False): +def get_default_component_of_type(missing_component_type, language='en', is_licensed=False, + is_trainable_pipe=False) -> NLUComponent: ''' - This function returns a default component for a missing component type. 
+ This function returns a default component for a missing component type and is a core part of the pipeline resolution. It is used to auto-complete pipelines which are missing required components. These represent defaults for many applications and should be set wisely. :param missing_component_type: String which is either just the component type or componenttype@spark_nlp_reference which stems from a model's storage ref and refers to some pretrained embeddings or model @@ -63,7 +33,7 @@ def get_default_component_of_type(missing_component_type, language='en', is_lice logger.info(f'Getting default for missing_component_type={missing_component_type}') if not '@' in missing_component_type: - # get default models if there is no @ in the model name included + # get default models if there is no @ in the model name and skip storage ref handling if missing_component_type == 'document': return Util('document_assembler', nlu_ref='document') if missing_component_type == 'sentence': return Util('deep_sentence_detector', nlu_ref='sentence') if missing_component_type == 'sentence_embeddings': return Embeddings('use', nlu_ref='embed_sentence.use') @@ -109,126 +79,11 @@ def get_default_component_of_type(missing_component_type, language='en', is_lice if 'pos' in missing_component_type or 'ner' in missing_component_type: return construct_component_from_identifier(language=language, component_type='classifier', nlp_ref=storage_ref) - # if 'unlabeled_dependency' in missing_component_type or 'dep.untyped' in missing_component_type: - # return UnlabledDepParser('dep.untyped') - # if 'labled_dependency' in missing_component_type or 'dep.typed' in missing_component_type: - # return LabledDepParser('dep.typed') - # if 'date' in missing_component_type: - # return None - - logger.exception("Could not resolve default component type for missing type=%s", missing_component_type) - - -def set_storage_ref_and_resolution_on_component_info(c, storage_ref): - """Sets a storage ref on a components component info and returns the component """ - c.info.storage_ref = storage_ref - - return c - - -def resolve_storage_ref(lang, storage_ref, missing_component_type): - """Returns a nlp_ref, nlu_ref and wether it is a licensed model or not and an updated languiage, if multi lingual""" - logger.info( - f"Resolving storage_ref={storage_ref} for lang={lang} and missing_component_type={missing_component_type}") - nlu_ref, nlp_ref, is_licensed = None, None, False - # get nlu ref - - # check if storage_ref is hardcoded - if lang in nlu.Spellbook.licensed_storage_ref_2_nlu_ref.keys() and storage_ref in \ - nlu.Spellbook.licensed_storage_ref_2_nlu_ref[lang].keys(): - nlu_ref = nlu.Spellbook.licensed_storage_ref_2_nlu_ref[lang][storage_ref] - is_licensed = True - elif lang in nlu.Spellbook.storage_ref_2_nlu_ref.keys() and storage_ref in nlu.Spellbook.storage_ref_2_nlu_ref[ - lang].keys(): - nlu_ref = nlu.Spellbook.storage_ref_2_nlu_ref[lang][ - storage_ref] # a HC model may use OS storage_ref_provider, so we dont know yet if it is licensed or not - - if lang in nlu.Spellbook.pretrained_models_references.keys() and nlu_ref in \ - nlu.Spellbook.pretrained_models_references[lang].keys(): - nlp_ref = nlu.Spellbook.pretrained_models_references[lang][nlu_ref] - elif lang in nlu.Spellbook.pretrained_healthcare_model_references.keys() and nlu_ref in \ - nlu.Spellbook.pretrained_healthcare_model_references[lang].keys(): - nlp_ref = nlu.Spellbook.pretrained_healthcare_model_references[lang][nlu_ref] - is_licensed = True - - - # check if 
storage_ref matches nlu_ref and get NLP_ref - elif lang in nlu.Spellbook.licensed_storage_ref_2_nlu_ref.keys() and storage_ref in \ - nlu.Spellbook.licensed_storage_ref_2_nlu_ref[lang].keys(): - nlu_ref = storage_ref - nlp_ref = nlu.Spellbook.licensed_storage_ref_2_nlu_ref[lang][nlu_ref] - elif lang in nlu.Spellbook.pretrained_models_references.keys() and storage_ref in \ - nlu.Spellbook.pretrained_models_references[lang].keys(): - nlu_ref = storage_ref - nlp_ref = nlu.Spellbook.pretrained_models_references[lang][nlu_ref] - - # check if storage_ref matches nlp_ref and get nlp and nlu ref - elif lang in nlu.Spellbook.pretrained_healthcare_model_references.keys(): - if storage_ref in nlu.Spellbook.pretrained_healthcare_model_references[lang].values(): - inv_namespace = {v: k for k, v in nlu.Spellbook.pretrained_healthcare_model_references[lang].items()} - nlp_ref = storage_ref - nlu_ref = inv_namespace[nlp_ref] - is_licensed = True - - if nlu_ref is not None and 'xx.' in nlu_ref: lang = 'xx' - - if nlp_ref is None and nlu_ref is not None: - # cast NLU ref to NLP ref - if is_licensed: - nlp_ref = Spellbook.pretrained_healthcare_model_references[lang][nlu_ref] - else: - nlp_ref = Spellbook.pretrained_models_references[lang][nlu_ref] - - if nlp_ref is not None and nlu_ref is None: - # cast NLP ref to NLU ref - if is_licensed: - inv_namespace = {v: k for k, v in nlu.Spellbook.pretrained_healthcare_model_references[lang].items()} - nlu_ref = inv_namespace[nlp_ref] - else: - inv_namespace = {v: k for k, v in nlu.Spellbook.pretrained_models_references[lang].items()} - nlu_ref = inv_namespace[nlp_ref] - - if nlu_ref == None and nlp_ref == None: - # todo enfore storage ref when trainin - logger.info(f"COULD NOT RESOLVE STORAGE_REF={storage_ref}") - if storage_ref == '': - if missing_component_type == 'sentence_embeddings': - logger.info("Using default storage_ref USE, assuming training mode") - storage_ref = 'en.embed_sentence.use' # this enables default USE embeds for traianble components - nlp_ref = 'tfhub_use' - nlu_ref = storage_ref - elif missing_component_type == 'word_embeddings': - logger.info("Using default storage_ref GLOVE, assuming training mode") - storage_ref = 'en.glove' # this enables default USE embeds for traianble components - nlp_ref = 'glove_100d' - nlu_ref = storage_ref - - else: - nlp_ref = storage_ref - nlu_ref = storage_ref - # raise ValueError - - if nlu_ref is not None: is_licensed = check_if_nlu_ref_is_licensed(nlu_ref) - - logger.info(f'Resolved storageref = {storage_ref} to NLU_ref = {nlu_ref} and NLP_ref = {nlp_ref}') - return nlu_ref, nlp_ref, is_licensed, lang - - -def check_if_nlu_ref_is_licensed(nlu_ref): - """check if a nlu_ref is pointing to a licensed or open source model. 
- This works by just checking if the NLU ref points to a healthcare model or not""" - for lang, universe in Spellbook.healthcare_component_alias_references.items(): - for hc_nlu_ref, hc_nlp_ref in universe.items(): - if hc_nlu_ref == nlu_ref: return True + logger.exception(f"Could not resolve default component type for missing type={missing_component_type}") - for lang, universe in Spellbook.pretrained_healthcare_model_references.items(): - for hc_nlu_ref, hc_nlp_ref in universe.items(): - if hc_nlu_ref == nlu_ref: return True - return False - - -def nlu_ref_to_component(nlu_reference, detect_lang=False, authenticated=False, is_recursive_call=False): +def nlu_ref_to_component(nlu_reference, detect_lang=False, authenticated=False, + is_recursive_call=False) -> NLUComponent: ''' This method implements the main namespace for all component names. It parses the input request and passes the data to a resolver method which searches the namespace for a Component for the input request. It returns a list of NLU.component objects or just one NLU.component object alone if just one component was specified. @@ -292,10 +147,6 @@ def nlu_ref_to_component(nlu_reference, detect_lang=False, authenticated=False, dataset = infos[2] if len(infos) == 4: # embeddings specified component_embeddings = infos[3] - - - - # passing embed_sentence can have format embed_sentence.lang.embedding or embed_sentence.embedding # i.e. embed_sentence.bert # fr.embed_sentence.bert will automatically select french bert thus no embed_sentence.en.bert or similar is required @@ -316,8 +167,7 @@ def nlu_ref_to_component(nlu_reference, detect_lang=False, authenticated=False, component_embeddings = infos[1] logger.info( - 'For input nlu_ref %s detected : \n lang: %s , component type: %s , component dataset: %s , component embeddings %s ', - nlu_reference, language, component_type, dataset, component_embeddings) + f'For input nlu_ref {nlu_reference} detected : \n lang: {language}, component type: {component_type}, component dataset: {dataset}, component embeddings: {component_embeddings}') resolved_component = resolve_component_from_parsed_query_data(language, component_type, dataset, component_embeddings, nlu_reference, trainable, authenticated=authenticated, @@ -329,7 +179,8 @@ def nlu_ref_to_component(nlu_reference, detect_lang=False, authenticated=False, def resolve_component_from_parsed_query_data(lang, component_type, dataset, component_embeddings, nlu_ref, - trainable=False, path=None, authenticated=False, is_recursive_call=False): + trainable=False, path=None, authenticated=False, + is_recursive_call=False) -> Union[NLUComponent]: # NLUPipeline ''' Searches the NLU name spaces for a matching NLU reference. 
From that NLU reference, a SparkNLP reference will be aquired which resolved to a SparkNLP pretrained model or pipeline :param nlu_ref: Full request which was passed to nlu.load() @@ -480,7 +331,7 @@ def resolve_component_from_parsed_query_data(lang, component_type, dataset, comp raise ValueError(f'EXCEPTION : Could not create NLU component for nlp_ref={nlp_ref} and nlu_ref={nlu_ref}') -def construct_trainable_component_from_identifier(nlu_ref, nlp_ref, authenticated=False): +def construct_trainable_component_from_identifier(nlu_ref, nlp_ref, authenticated=False) -> NLUComponent: ''' This method returns a Spark NLP annotator Approach class embelished by a NLU component :param nlu_ref: nlu ref to the trainable model @@ -552,7 +403,8 @@ def construct_trainable_component_from_identifier(nlu_ref, nlp_ref, authenticate f'EXCEPTION: Could not create trainable NLU component for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}') -def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None, is_licensed=False, strict=False): +def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None, is_licensed=False, + strict=False): # -> NLUPipeline ''' # creates a list of components from a Spark NLP Pipeline reference # 1. download pipeline @@ -580,7 +432,6 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No constructed_components = [] # for component in pipe.light_model.pipeline_model.stages: for component in iterable_stages: - logger.info(f"Extracting model from Spark NLP pipeline: {component} and creating Component") parsed = str(component).split('_')[0].lower() logger.info(f"Parsed Component for : {parsed}") @@ -648,8 +499,6 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No constructed_components.append( nlu.Classifier(model=component, language=language, nlu_ref=nlu_ref, nlp_ref=nlp_ref, loaded_from_pretrained_pipe=True)) - - elif isinstance(component, Chunker): constructed_components.append( nlu.chunker.Chunker(annotator_class='default_chunker', model=component, lang=language, nlu_ref=nlu_ref, @@ -659,51 +508,67 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No embeddings_chunker.EmbeddingsChunker(annotator_class='chunk_embedder', model=component, lang=language, nlu_ref=nlu_ref, nlp_ref=nlp_ref, loaded_from_pretrained_pipe=True)) - - - elif isinstance(component, RegexMatcherModel) or parsed == 'match': - constructed_components.append(nlu.Matcher(model=component, annotator_class='regex', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append(nlu.Matcher(model=component, annotator_class='regex', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, TextMatcherModel): - constructed_components.append(nlu.Matcher(model=component, annotator_class='text', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.Matcher(model=component, annotator_class='text', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, DateMatcher): - constructed_components.append(nlu.Matcher(model=component, annotator_class='date', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.Matcher(model=component, annotator_class='date', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, ContextSpellCheckerModel): - constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context', nlu_ref=nlu_ref, 
loaded_from_pretrained_pipe=True)) + constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, SymmetricDeleteModel): constructed_components.append( - nlu.SpellChecker(model=component, annotator_class='symmetric', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + nlu.SpellChecker(model=component, annotator_class='symmetric', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, NorvigSweetingModel): - constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig_spell', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, LemmatizerModel): - constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.lemmatizer.Lemmatizer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, NormalizerModel): - constructed_components.append(nlu.normalizer.Normalizer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.normalizer.Normalizer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, Stemmer): - constructed_components.append(nlu.stemmer.Stemmer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.stemmer.Stemmer(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, (NerDLModel, NerCrfModel)): constructed_components.append( - nlu.Classifier(model=component, annotator_class='ner', language=language, nlu_ref=nlu_ref, nlp_ref=nlp_ref, + nlu.Classifier(model=component, annotator_class='ner', language=language, nlu_ref=nlu_ref, + nlp_ref=nlp_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, LanguageDetectorDL): constructed_components.append( - nlu.Classifier(model=component, annotator_class='language_detector', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + nlu.Classifier(model=component, annotator_class='language_detector', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, DependencyParserModel): constructed_components.append( UnlabledDepParser(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + elif isinstance(component, Doc2VecModel): + constructed_components.append( + nlu.Embeddings(model=component, annotator_class='doc2vec', lang=language, nlu_ref=nlu_ref, + nlp_ref=nlp_ref, loaded_from_pretrained_pipe=True, do_ref_checks=False)) elif isinstance(component, TypedDependencyParserModel): constructed_components.append( LabledDepParser(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True, )) elif isinstance(component, MultiClassifierDLModel): - constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, (SentimentDetectorModel, SentimentDLModel)): - constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + 
constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl', nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, (SentimentDetectorModel, ViveknSentimentModel)): - constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.Classifier(model=component, nlp_ref='vivekn', nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, NGram): constructed_components.append( - nlu.chunker.Chunker(annotator_class='ngram', model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + nlu.chunker.Chunker(annotator_class='ngram', model=component, nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, StopWordsCleaner): from nlu.components.stopwordscleaner import StopWordsCleaner as Stopw constructed_components.append(Stopw(annotator_class='Stopw', model=component, nlu_ref=nlu_ref)) @@ -712,12 +577,15 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No constructed_components.append( nlu.Matcher(model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, (T5Transformer)): - constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.Seq2Seq(annotator_class='t5', model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) elif isinstance(component, (MarianTransformer)): - constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component, nlu_ref=nlu_ref, loaded_from_pretrained_pipe=True)) + constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component, nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif isinstance(component, SentenceEmbeddings): - constructed_components.append(Util(annotator_class='sentence_embeddings', model=component, nlu_ref=nlu_ref,loaded_from_pretrained_pipe=True )) + constructed_components.append(Util(annotator_class='sentence_embeddings', model=component, nlu_ref=nlu_ref, + loaded_from_pretrained_pipe=True)) elif parsed in Spellbook.word_embeddings + Spellbook.sentence_embeddings: constructed_components.append( nlu.Embeddings(model=component, lang=language, nlu_ref=nlu_ref, nlp_ref=nlp_ref, @@ -729,8 +597,6 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No ChunkFilterer from sparknlp_jsl.annotator import ChunkMergeModel, ContextualParserModel, DeIdentificationModel, \ DocumentLogRegClassifierModel, DrugNormalizer - from sparknlp_jsl.annotator import GenericClassifierModel, IOBTagger, NerChunker, NerConverterInternal, \ - NerDisambiguatorModel, ReIdentification from sparknlp_jsl.annotator import MedicalNerModel, RelationExtractionModel, RelationExtractionDLModel, \ RENerChunksFilter, SentenceEntityResolverModel # todo embelish ChunkFilterer, Chunk2Token, Disambeguiate, DrugNormalizer, RENerChunksfilterer, IOBTager(???)_, ReIdentify,, NERChunker @@ -781,7 +647,8 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No f"Could not infer component type for lang={language} and nlp_ref={nlp_ref} of type={component} during pipeline conversion ") logger.warning( f"Warning: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion, using default type Normalizer") - 
constructed_components.append(nlu.normalizer.Normalizer(model=component, loaded_from_pretrained_pipe=True)) + constructed_components.append( + nlu.normalizer.Normalizer(model=component, loaded_from_pretrained_pipe=True)) else: if strict: raise Exception( f"Could not infer component type for lang={language} and nlp_ref={nlp_ref} of type={component} during pipeline conversion ") @@ -790,18 +657,16 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No constructed_components.append(nlu.normalizer.Normalizer(model=component, loaded_from_pretrained_pipe=True)) logger.info(f"Extracted into NLU Component type : {parsed}", ) - if None in constructed_components: raise Exception(f"Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,") - - # TODO update input/output cols on Annotators - return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components, nlp_ref, language,path)) - # return PipeUtils.enforece_AT_embedding_provider_output_col_name_schema_for_list_of_components( - # ComponentUtils.set_storage_ref_attribute_of_embedding_converters(constructed_components)) + if None in constructed_components: raise Exception( + f"Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,") + return ComponentUtils.set_storage_ref_attribute_of_embedding_converters( + PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components, nlp_ref, language, path)) def construct_component_from_identifier(language, component_type='', dataset='', component_embeddings='', nlu_ref='', - nlp_ref='', is_licensed=False): + nlp_ref='', is_licensed=False) -> NLUComponent: ''' - Creates a NLU component from a pretrained SparkNLP model reference or Class reference. + Creates a NLU component from a pretrained SparkNLP model reference or Class reference. First step to get the Root of the NLP DAG Class references will return default pretrained models :param language: Language of the sparknlp model reference :param component_type: Class which will be used to instantiate the model @@ -811,7 +676,6 @@ def construct_component_from_identifier(language, component_type='', dataset='', :param nlp_ref: Full Spark NLP reference :return: Returns a NLU component which embelished the Spark NLP pretrained model and class for that model ''' - logger.info( f'Creating singular NLU component for type={component_type} sparknlp_ref={nlp_ref} , nlu_ref={nlu_ref} dataset={dataset}, language={language} ') try: @@ -833,7 +697,6 @@ def construct_component_from_identifier(language, component_type='', dataset='', configs=dataset, is_licensed=is_licensed) # if any([component_type in NameSpace.word_embeddings,dataset in NameSpace.word_embeddings, nlu_ref in NameSpace.word_embeddings, nlp_ref in NameSpace.word_embeddings]): - # TODO new CATEGRRY for token classifiers or smth else?? 
elif any( x in Spellbook.classifiers for x in [nlp_ref, nlu_ref, dataset, component_type, ] + dataset.split('_')): return Classifier(get_default=False, nlp_ref=nlp_ref, nlu_ref=nlu_ref, language=language, @@ -907,17 +770,3 @@ def construct_component_from_identifier(language, component_type='', dataset='', f'EXCEPTION: Could not resolve singular Component for type={component_type} and nlp_ref={nlp_ref} and nlu_ref={nlu_ref} and lang ={language} ') return None # raise ValueError - - -def extract_classifier_metadata_from_nlu_ref(nlu_ref): - ''' - Extract classifier and metadataname from nlu reference which is handy for deciding what output column names should be - Strips lang and action from nlu_ref and returns a list of remaining identifiers, i.e [,, - :param nlu_ref: nlu reference from which to extra model meta data - :return: [, , ,] . For pure actions this will return [] - ''' - model_infos = [] - for e in nlu_ref.split('.'): - if e in nlu.all_components_info.all_languages or e in nlu.spellbook.Spellbook.actions: continue - model_infos.append(e) - return model_infos diff --git a/nlu/pipe/extractors/extraction_resolver_HC.py b/nlu/pipe/extractors/extraction_resolver_HC.py index 5891bf72..74e8e203 100644 --- a/nlu/pipe/extractors/extraction_resolver_HC.py +++ b/nlu/pipe/extractors/extraction_resolver_HC.py @@ -9,8 +9,8 @@ If a document has multi-sentences, this will map a label back to a corrosponding sentence """ -from nlu.pipe.extractors.extractor_configs_open_source import * -from nlu.pipe.extractors.extractor_configs_healthcare import * +from nlu.pipe.extractors.extractor_configs_OS import * +from nlu.pipe.extractors.extractor_configs_HC import * from sparknlp_jsl.annotator import * from sparknlp_jsl.base import * diff --git a/nlu/pipe/extractors/extraction_resolver_OS.py b/nlu/pipe/extractors/extraction_resolver_OS.py index bee5b04b..7f94d736 100644 --- a/nlu/pipe/extractors/extraction_resolver_OS.py +++ b/nlu/pipe/extractors/extraction_resolver_OS.py @@ -11,7 +11,7 @@ """ from sparknlp.annotator import * from sparknlp.base import * -from nlu.pipe.extractors.extractor_configs_open_source import * +from nlu.pipe.extractors.extractor_configs_OS import * OS_anno2config = { NerConverter: { @@ -36,6 +36,15 @@ 'default_full': default_full_config, }, + BertForSequenceClassification : { + 'default': default_classifier_dl_config, + 'default_full': default_full_config, + }, + + DistilBertForSequenceClassification : { + 'default': default_classifier_dl_config, + 'default_full': default_full_config, + }, BertForTokenClassification: { @@ -115,6 +124,15 @@ 'default': default_sentence_embedding_config, 'default_full': default_full_config, }, + Doc2VecModel: { + 'default': default_sentence_embedding_config, + 'default_full': default_full_config, + }, + Doc2VecApproach: { + 'default': default_sentence_embedding_config, + 'default_full': default_full_config, + }, + UniversalSentenceEncoder: { 'default': default_sentence_embedding_config, 'default_full': default_full_config, diff --git a/nlu/pipe/extractors/extractor_configs_healthcare.py b/nlu/pipe/extractors/extractor_configs_HC.py similarity index 100% rename from nlu/pipe/extractors/extractor_configs_healthcare.py rename to nlu/pipe/extractors/extractor_configs_HC.py diff --git a/nlu/pipe/extractors/extractor_configs_open_source.py b/nlu/pipe/extractors/extractor_configs_OS.py similarity index 100% rename from nlu/pipe/extractors/extractor_configs_open_source.py rename to nlu/pipe/extractors/extractor_configs_OS.py diff --git 
a/nlu/pipe/pipe_components.py b/nlu/pipe/pipe_component.py similarity index 78% rename from nlu/pipe/pipe_components.py rename to nlu/pipe/pipe_component.py index fcc3cecd..743c29be 100644 --- a/nlu/pipe/pipe_components.py +++ b/nlu/pipe/pipe_component.py @@ -1,5 +1,7 @@ # from nlu import * import nlu + + class NLUComponent(): ''' This class loads all the components in the components folder. @@ -27,15 +29,16 @@ def info(self): self.print_parameters() -class SparkNLUComponent(NLUComponent): - def __init__(self, component_name, component_type, nlu_ref='', nlp_ref='',lang='',loaded_from_pretrained_pipe=False, is_licensed=False): +class SparkNLUComponent(NLUComponent): + def __init__(self, component_name, component_type, nlu_ref='', nlp_ref='', lang='', + loaded_from_pretrained_pipe=False, is_licensed=False): NLUComponent.__init__(self, component_name, component_type) self.info.nlu_ref = nlu_ref self.info.nlp_ref = nlp_ref - self.info.lang = lang + self.info.lang = lang self.info.loaded_from_pretrained_pipe = loaded_from_pretrained_pipe self.__set_missing_model_attributes__() - if is_licensed : self.info.license = 'healthcare' + if is_licensed: self.info.license = 'healthcare' def __set_missing_model_attributes__(self): ''' @@ -46,34 +49,30 @@ def __set_missing_model_attributes__(self): ''' for k in self.model.extractParamMap(): if "inputCol" in str(k): - if isinstance(self.model.extractParamMap()[k], str) : - if self.model.extractParamMap()[k] == 'embeddings': # swap name so we have uniform col names - self.model.setInputCols( 'word_embeddings') - self.info.spark_input_column_names = [self.model.extractParamMap()[k]] - else : - if 'embeddings' in self.model.extractParamMap()[k]: # swap name so we have uniform col names + if isinstance(self.model.extractParamMap()[k], str): + if self.model.extractParamMap()[k] == 'embeddings': # swap name so we have uniform col names + self.model.setInputCols('word_embeddings') + self.info.spark_input_column_names = [self.model.extractParamMap()[k]] + else: + if 'embeddings' in self.model.extractParamMap()[k]: # swap name so we have uniform col names new_cols = self.model.extractParamMap()[k] new_cols.remove("embeddings") new_cols.append("word_embeddings") self.model.setInputCols(new_cols) - self.info.spark_input_column_names = self.model.extractParamMap()[k] + self.info.spark_input_column_names = self.model.extractParamMap()[k] if "outputCol" in str(k): - if isinstance(self.model.extractParamMap()[k], str) : - if self.model.extractParamMap()[k] == 'embeddings': # swap name so we have uniform col names - self.model.setOutputCol( 'word_embeddings') - self.info.spark_output_column_names = [self.model.extractParamMap()[k]] - else : - if 'embeddings' in self.model.extractParamMap()[k]: # swap name so we have uniform col names + if isinstance(self.model.extractParamMap()[k], str): + if self.model.extractParamMap()[k] == 'embeddings': # swap name so we have uniform col names + self.model.setOutputCol('word_embeddings') + self.info.spark_output_column_names = [self.model.extractParamMap()[k]] + else: + if 'embeddings' in self.model.extractParamMap()[k]: # swap name so we have uniform col names new_cols = self.model.extractParamMap()[k] new_cols.remove("embeddings") new_cols.append("word_embeddings") self.model.setOutputCol(new_cols) - self.info.spark_output_column_names = self.model.extractParamMap()[k] - - - - + self.info.spark_output_column_names = self.model.extractParamMap()[k] # if "labelCol" in str(k): # if isinstance(self.model.extractParamMap()[k], str) : 
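The cleaned-up `SparkNLUComponent.__set_missing_model_attributes__` above normalizes every annotator input/output column named `embeddings` to `word_embeddings`, so all components in a pipe expose uniform column names. A minimal standalone sketch of just that renaming rule (illustrative only; `normalize_embed_cols` is a hypothetical helper, not part of the NLU API):

```python
from typing import List, Union

def normalize_embed_cols(cols: Union[str, List[str]]) -> List[str]:
    # Rename the ambiguous 'embeddings' column to 'word_embeddings',
    # mirroring the normalization done in SparkNLUComponent.
    if isinstance(cols, str):
        return ['word_embeddings' if cols == 'embeddings' else cols]
    if 'embeddings' in cols:
        cols = [c for c in cols if c != 'embeddings']
        cols.append('word_embeddings')
    return cols

assert normalize_embed_cols('embeddings') == ['word_embeddings']
assert normalize_embed_cols(['document', 'embeddings']) == ['document', 'word_embeddings']
```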
diff --git a/nlu/pipe/pipe_logic.py b/nlu/pipe/pipe_logic.py index 18b360ab..b9041547 100644 --- a/nlu/pipe/pipe_logic.py +++ b/nlu/pipe/pipe_logic.py @@ -2,11 +2,10 @@ import logging logger = logging.getLogger('nlu') -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent from nlu.pipe.utils.pipe_utils import PipeUtils from nlu.pipe.utils.component_utils import ComponentUtils -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils - +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils from dataclasses import dataclass from nlu.pipe.component_resolution import get_default_component_of_type @@ -649,7 +648,6 @@ def enforce_chunk2doc_on_sentence_embeddings(pipe): """ if not pipe.has_licensed_components: return pipe from sparknlp_jsl.annotator import SentenceEntityResolverModel, NerConverter, NerConverterInternal - from sparknlp.base import Chunk2Doc resolvers = [] ner_converters = [] sentence_embeddings = [] diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py index 42099d17..1c17d1cd 100644 --- a/nlu/pipe/pipeline.py +++ b/nlu/pipe/pipeline.py @@ -1,13 +1,10 @@ import logging - logger = logging.getLogger('nlu') - from nlu.pipe.extractors.extraction_resolver_OS import OS_anno2config from nlu.pipe.extractors.extractor_methods.base_extractor_methods import * from nlu.pipe.pipe_logic import PipeUtils -import nlu -import nlu.pipe.pipe_components +import nlu.pipe.pipe_component import sparknlp from typing import List, Union @@ -18,8 +15,8 @@ import pandas as pd import numpy as np from pyspark.sql.types import StructType, StructField, StringType -from nlu.pipe.component_resolution import extract_classifier_metadata_from_nlu_ref -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils +from nlu.pipe.utils.resolution.nlu_ref_utils import extract_classifier_metadata_from_nlu_ref +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils from nlu.pipe.utils.component_utils import ComponentUtils from nlu.pipe.utils.output_level_resolution_utils import OutputLevelUtils from nlu.pipe.utils.data_conversion_utils import DataConversionUtils diff --git a/nlu/pipe/utils/component_utils.py b/nlu/pipe/utils/component_utils.py index 1b2c5399..9e284431 100644 --- a/nlu/pipe/utils/component_utils.py +++ b/nlu/pipe/utils/component_utils.py @@ -2,8 +2,8 @@ import logging logger = logging.getLogger('nlu') import inspect -from nlu.pipe.pipe_components import SparkNLUComponent -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils +from nlu.pipe.pipe_component import SparkNLUComponent +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils class ComponentUtils(): """Component and Column Level logic operations and utils""" diff --git a/nlu/pipe/utils/output_level_resolution_utils.py b/nlu/pipe/utils/output_level_resolution_utils.py index 26eea10f..2db37138 100644 --- a/nlu/pipe/utils/output_level_resolution_utils.py +++ b/nlu/pipe/utils/output_level_resolution_utils.py @@ -24,7 +24,7 @@ class OutputLevelUtils(): 'document': [DocumentAssembler, Chunk2Doc, YakeKeywordExtraction,DocumentNormalizer ], - 'sentence': [SentenceDetector, SentenceDetectorDLApproach, ], + 'sentence': [SentenceDetector, SentenceDetectorDLApproach ], 'chunk': [Chunker, ChunkEmbeddings, ChunkTokenizer, Token2Chunk, TokenAssembler, NerConverter, Doc2Chunk,NGramGenerator], 'token': [ NerCrfApproach, NerDLApproach, @@ -41,7 +41,7 @@ class OutputLevelUtils(): # these can be document or sentence 'input_dependent': 
[ViveknSentimentApproach, SentimentDLApproach, ClassifierDLApproach, LanguageDetectorDL, - MultiClassifierDLApproach, SentenceEmbeddings, NorvigSweetingApproach,], + MultiClassifierDLApproach, SentenceEmbeddings, NorvigSweetingApproach,BertForSequenceClassification, DistilBertForTokenClassification,], 'multi' : [MultiClassifierDLApproach, SentenceEmbeddings, NorvigSweetingApproach,] # 'unclassified': [Yake, Ngram] } diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py index ae3efac7..da4e200f 100644 --- a/nlu/pipe/utils/pipe_utils.py +++ b/nlu/pipe/utils/pipe_utils.py @@ -23,7 +23,6 @@ def set_column_values_on_components_from_pretrained_pipe(pipe, nlp_ref, lang, pa else: pipe_path = os.path.expanduser('~') + '/cache_pretrained/' + f'{nlp_ref}_{lang}' # We do not need to check for the Spark version, since cols should match across versions - # TODO but what about LOCAL pipes!!!! todo fix pipe_path = glob.glob(f'{pipe_path}*')[0] if not os.path.exists(pipe_path): raise FileNotFoundError( f"Could not find downloaded Pipeline at path={pipe_path}") @@ -32,7 +31,6 @@ def set_column_values_on_components_from_pretrained_pipe(pipe, nlp_ref, lang, pa digits_num = len(str(len(pipe))) digit_str = '0' * digits_num digit_cur = 0 - for c in pipe: # c_metadata_path = f'{pipe_path}/stages/{digit_str}_*/metadata/part-00000' c_metadata_path = f'{pipe_path}/stages/{digit_str}_*/metadata/part-00000' @@ -50,7 +48,6 @@ def set_column_values_on_components_from_pretrained_pipe(pipe, nlp_ref, lang, pa c.info.spark_input_column_names = inp if isinstance(inp, List) else [inp] c.info.spark_output_column_names = [out] c.model.setOutputCol(out) - digit_cur += 1 digit_str = str(digit_cur) while len(digit_str) < digits_num: diff --git a/nlu/pipe/utils/resolution/__init__.py b/nlu/pipe/utils/resolution/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/pipe/utils/resolution/nlu_ref_utils.py b/nlu/pipe/utils/resolution/nlu_ref_utils.py new file mode 100644 index 00000000..01037320 --- /dev/null +++ b/nlu/pipe/utils/resolution/nlu_ref_utils.py @@ -0,0 +1,40 @@ +from nlu import Spellbook, logger +from nlu.info import AllComponentsInfo + + +def check_if_nlu_ref_is_licensed(nlu_ref): + """Check if a nlu_ref is pointing to a licensed or open-source model. + This works by checking whether the NLU ref points to a healthcare model or not""" + for lang, universe in Spellbook.healthcare_component_alias_references.items(): + for hc_nlu_ref, hc_nlp_ref in universe.items(): + if hc_nlu_ref == nlu_ref: return True + for lang, universe in Spellbook.pretrained_healthcare_model_references.items(): + for hc_nlu_ref, hc_nlp_ref in universe.items(): + if hc_nlu_ref == nlu_ref: return True + + return False + + +def parse_language_from_nlu_ref(nlu_ref): + """Parse an ISO language identifier from a NLU reference, which can be used to load a Spark NLP model""" + infos = nlu_ref.split('.') + for split in infos: + if split in AllComponentsInfo().all_languages: + logger.info(f'Parsed Nlu_ref={nlu_ref} as lang={split}') + return split + logger.info(f'Parsed Nlu_ref={nlu_ref} as lang=en') + return 'en' + + +def extract_classifier_metadata_from_nlu_ref(nlu_ref): + ''' + Extract the classifier and metadata name from a nlu reference, which is handy for deciding what the output column names should be. + Strips lang and action from the nlu_ref and returns a list of the remaining identifiers. + :param nlu_ref: nlu reference from which to extract model metadata + :return: list of the remaining identifiers. 
For pure actions this will return []. + ''' + model_infos = [] + for e in nlu_ref.split('.'): + if e in AllComponentsInfo().all_languages or e in Spellbook.actions: continue + model_infos.append(e) + return model_infos \ No newline at end of file diff --git a/nlu/pipe/utils/resolution/storage_ref_resolution_utils.py b/nlu/pipe/utils/resolution/storage_ref_resolution_utils.py new file mode 100644 index 00000000..097b1187 --- /dev/null +++ b/nlu/pipe/utils/resolution/storage_ref_resolution_utils.py @@ -0,0 +1,96 @@ +import logging +from nlu.pipe.utils.resolution.nlu_ref_utils import * +from nlu import Spellbook + +logger = logging.getLogger('nlu') + + +def resolve_storage_ref(lang, storage_ref, missing_component_type): + """Returns a nlp_ref, nlu_ref, whether it is a licensed model or not, and an updated language if the model is multilingual""" + logger.info( + f"Resolving storage_ref={storage_ref} for lang={lang} and missing_component_type={missing_component_type}") + nlu_ref, nlp_ref, is_licensed = None, None, False + # get nlu ref + + # check if storage_ref is hardcoded + if lang in Spellbook.licensed_storage_ref_2_nlu_ref.keys() and storage_ref in \ + Spellbook.licensed_storage_ref_2_nlu_ref[lang].keys(): + nlu_ref = Spellbook.licensed_storage_ref_2_nlu_ref[lang][storage_ref] + is_licensed = True + elif lang in Spellbook.storage_ref_2_nlu_ref.keys() and storage_ref in Spellbook.storage_ref_2_nlu_ref[ + lang].keys(): + nlu_ref = Spellbook.storage_ref_2_nlu_ref[lang][ + storage_ref] # a HC model may use an OS storage_ref_provider, so we don't know yet if it is licensed or not + if lang in Spellbook.pretrained_models_references.keys() and nlu_ref in \ + Spellbook.pretrained_models_references[lang].keys(): + nlp_ref = Spellbook.pretrained_models_references[lang][nlu_ref] + elif lang in Spellbook.pretrained_healthcare_model_references.keys() and nlu_ref in \ + Spellbook.pretrained_healthcare_model_references[lang].keys(): + nlp_ref = Spellbook.pretrained_healthcare_model_references[lang][nlu_ref] + is_licensed = True + # check if storage_ref matches nlu_ref and get NLP_ref + elif lang in Spellbook.licensed_storage_ref_2_nlu_ref.keys() and storage_ref in \ + Spellbook.licensed_storage_ref_2_nlu_ref[lang].keys(): + nlu_ref = storage_ref + nlp_ref = Spellbook.licensed_storage_ref_2_nlu_ref[lang][nlu_ref] + elif lang in Spellbook.pretrained_models_references.keys() and storage_ref in \ + Spellbook.pretrained_models_references[lang].keys(): + nlu_ref = storage_ref + nlp_ref = Spellbook.pretrained_models_references[lang][nlu_ref] + + # check if storage_ref matches nlp_ref and get nlp and nlu ref + elif lang in Spellbook.pretrained_healthcare_model_references.keys(): + if storage_ref in Spellbook.pretrained_healthcare_model_references[lang].values(): + inv_namespace = {v: k for k, v in Spellbook.pretrained_healthcare_model_references[lang].items()} + nlp_ref = storage_ref + nlu_ref = inv_namespace[nlp_ref] + is_licensed = True + + if nlu_ref is not None and 'xx.' 
in nlu_ref: lang = 'xx' + + if nlp_ref is None and nlu_ref is not None: + # cast NLU ref to NLP ref + if is_licensed: + nlp_ref = Spellbook.pretrained_healthcare_model_references[lang][nlu_ref] + else: + nlp_ref = Spellbook.pretrained_models_references[lang][nlu_ref] + + if nlp_ref is not None and nlu_ref is None: + # cast NLP ref to NLU ref + if is_licensed: + inv_namespace = {v: k for k, v in Spellbook.pretrained_healthcare_model_references[lang].items()} + nlu_ref = inv_namespace[nlp_ref] + else: + inv_namespace = {v: k for k, v in Spellbook.pretrained_models_references[lang].items()} + nlu_ref = inv_namespace[nlp_ref] + + if nlu_ref is None and nlp_ref is None: + # todo enforce storage ref when training + logger.info(f"COULD NOT RESOLVE STORAGE_REF={storage_ref}") + if storage_ref == '': + if missing_component_type == 'sentence_embeddings': + logger.info("Using default storage_ref USE, assuming training mode") + storage_ref = 'en.embed_sentence.use' # this enables default USE embeds for trainable components + nlp_ref = 'tfhub_use' + nlu_ref = storage_ref + elif missing_component_type == 'word_embeddings': + logger.info("Using default storage_ref GLOVE, assuming training mode") + storage_ref = 'en.glove' # this enables default GloVe embeds for trainable components + nlp_ref = 'glove_100d' + nlu_ref = storage_ref + + else: + nlp_ref = storage_ref + nlu_ref = storage_ref + # raise ValueError + + if nlu_ref is not None: is_licensed = check_if_nlu_ref_is_licensed(nlu_ref) + + logger.info(f'Resolved storage_ref = {storage_ref} to NLU_ref = {nlu_ref} and NLP_ref = {nlp_ref}') + return nlu_ref, nlp_ref, is_licensed, lang + + +def set_storage_ref_and_resolution_on_component_info(c, storage_ref): + """Sets a storage ref on a component's component info and returns the component""" + c.info.storage_ref = storage_ref + return c \ No newline at end of file diff --git a/nlu/pipe/utils/storage_ref_utils.py b/nlu/pipe/utils/resolution/storage_ref_utils.py similarity index 97% rename from nlu/pipe/utils/storage_ref_utils.py rename to nlu/pipe/utils/resolution/storage_ref_utils.py index cdcaa090..f05d4bf7 100644 --- a/nlu/pipe/utils/storage_ref_utils.py +++ b/nlu/pipe/utils/resolution/storage_ref_utils.py @@ -1,5 +1,6 @@ import logging -from nlu.pipe.utils import uid_to_storageref as uid2storageref +from nlu.pipe.utils.resolution import uid_to_storage_ref as uid2storageref + logger = logging.getLogger('nlu') @@ -10,7 +11,6 @@ def has_storage_ref(component): """Storage ref is either on the model or nlu component defined """ return StorageRefUtils.has_component_storage_ref_or_anno_storage_ref(component) - @staticmethod def extract_storage_ref(component, prefer_anno=False): """Extract storage ref from either a NLU component or NLP Annotator. 
First cheks if annotator has storage ref, otherwise check NLU attribute""" @@ -30,7 +30,6 @@ def fallback_storage_ref_resolutions(storage_ref): return uid2storageref.mappings[storage_ref] else : return storage_ref - ### SUB HELPERS @staticmethod @@ -39,7 +38,6 @@ def has_component_storage_ref_or_anno_storage_ref(component): if StorageRefUtils.nlp_component_has_storage_ref(component.model): return True if StorageRefUtils.nlu_component_has_storage_ref(component) : return True - @staticmethod def nlp_component_has_storage_ref(model): """Check if a storage ref is defined on the Spark NLP Annotator model""" @@ -47,7 +45,6 @@ def nlp_component_has_storage_ref(model): if k.name == 'storageRef': return True return False - @staticmethod def extract_storage_ref_from_component(component): """Extract storage ref from a NLU component which embelished a Spark NLP Annotator""" @@ -58,7 +55,6 @@ def extract_storage_ref_from_component(component): else: return '' - @staticmethod def nlu_extract_storage_ref_nlp_model(component): """Extract storage ref from a NLU component which embelished a Spark NLP Annotator""" @@ -76,3 +72,6 @@ def nlp_extract_storage_ref_nlp_model(model): return model.extractParamMap()[model.model.getParam('storageRef')] + + + diff --git a/nlu/pipe/utils/uid_to_storageref.py b/nlu/pipe/utils/resolution/uid_to_storage_ref.py similarity index 100% rename from nlu/pipe/utils/uid_to_storageref.py rename to nlu/pipe/utils/resolution/uid_to_storage_ref.py diff --git a/nlu/pipe/viz/streamlit_viz/streamlit_dashboard_OS.py b/nlu/pipe/viz/streamlit_viz/streamlit_dashboard_OS.py index f8af4a38..316e8ed5 100644 --- a/nlu/pipe/viz/streamlit_viz/streamlit_dashboard_OS.py +++ b/nlu/pipe/viz/streamlit_viz/streamlit_dashboard_OS.py @@ -1,15 +1,9 @@ -import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils from typing import List, Tuple, Optional, Dict, Union import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS -from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker from nlu.pipe.viz.streamlit_viz.viz_building_blocks.dep_tree import DepTreeStreamlitBlock from nlu.pipe.viz.streamlit_viz.viz_building_blocks.classifier import ClassifierStreamlitBlock diff --git a/nlu/pipe/viz/streamlit_viz/streamlit_utils_OS.py b/nlu/pipe/viz/streamlit_viz/streamlit_utils_OS.py index 549d32ca..a9081c39 100644 --- a/nlu/pipe/viz/streamlit_viz/streamlit_utils_OS.py +++ b/nlu/pipe/viz/streamlit_viz/streamlit_utils_OS.py @@ -1,12 +1,7 @@ -from sparknlp.annotator import NerConverter,DependencyParserModel -from typing import List, Tuple, Optional, Dict import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np -import pandas as pd from sparknlp.annotator import * import nlu -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils class StreamlitUtilsOS(): classifers_OS = [ ClassifierDLModel, LanguageDetectorDL, MultiClassifierDLModel, NerDLModel, NerCrfModel, YakeKeywordExtraction, PerceptronModel, SentimentDLModel, diff --git a/nlu/pipe/viz/streamlit_viz/streamlit_viz_tracker.py 
b/nlu/pipe/viz/streamlit_viz/streamlit_viz_tracker.py index ca534e8f..cfb3f841 100644 --- a/nlu/pipe/viz/streamlit_viz/streamlit_viz_tracker.py +++ b/nlu/pipe/viz/streamlit_viz/streamlit_viz_tracker.py @@ -1,15 +1,9 @@ -import nlu -from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union import streamlit as st from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np -import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS -from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random + + class StreamlitVizTracker(): """Track the status of the visualizations and models loaded in the Streamlit Web View. This is the Model part of the MVC pattern""" _set_block_container_style() diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py index 7493565b..be7cec82 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py @@ -1,12 +1,5 @@ -from sparknlp.annotator import NerConverter,DependencyParserModel -from typing import List, Tuple, Optional, Dict import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np -import pandas as pd from sparknlp.annotator import * -import nlu -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils from nlu.components import embeddings_chunker class EntityManifoldUtils(): @@ -71,7 +64,7 @@ def get_ner_cols(df): @staticmethod def find_entity_embed_col_pd(df, search_multi=False): """Find col that contains embed in pandas df """ - if not search_multi: # TODO TEST + if not search_multi: for c in df.columns: if 'embed_entitiy'in c : return c else: diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/classifier.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/classifier.py index 3dabea23..986f2b7e 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/classifier.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/classifier.py @@ -1,15 +1,11 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from typing import List, Optional, Union import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker class ClassifierStreamlitBlock(): diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/dep_tree.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/dep_tree.py index 9d4cc8ba..177e114c 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/dep_tree.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/dep_tree.py @@ -1,15 +1,8 @@ -import nlu -from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from typing 
import Optional import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np -import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/entity_embedding_manifold.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/entity_embedding_manifold.py index cdd514dc..a5c1fe26 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/entity_embedding_manifold.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/entity_embedding_manifold.py @@ -1,15 +1,11 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils +from typing import List, Optional import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils import numpy as np import pandas as pd -from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS -from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker from nlu.pipe.viz.streamlit_viz.viz_building_blocks.block_utils.entity_manifold_utils import EntityManifoldUtils diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/ner.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/ner.py index ee33078b..285454fc 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/ner.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/ner.py @@ -1,15 +1,9 @@ -import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from typing import List, Optional, Dict import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np -import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker class NERStreamlitBlock(): @staticmethod diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/sentence_embedding_manifold.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/sentence_embedding_manifold.py index d8639bbb..3f1861c5 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/sentence_embedding_manifold.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/sentence_embedding_manifold.py @@ -1,15 +1,11 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils +from typing import List, Optional import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils import numpy as np import pandas as pd -from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS -from 
nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/token_features.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/token_features.py index c1786b94..d2230288 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/token_features.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/token_features.py @@ -1,15 +1,11 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from typing import List, Optional import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils -import numpy as np import pandas as pd from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker class TokenFeaturesStreamlitBlock(): @staticmethod diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_embedding_manifold.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_embedding_manifold.py index e8d2c2c4..004aff3c 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_embedding_manifold.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_embedding_manifold.py @@ -1,15 +1,11 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils +from typing import List, Optional import streamlit as st -from nlu.utils.modelhub.modelhub_utils import ModelHubUtils import numpy as np import pandas as pd -from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS -from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker class WordEmbeddingManifoldStreamlitBlock(): @staticmethod diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_similarity.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_similarity.py index f73fcb91..7566ed51 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_similarity.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/word_similarity.py @@ -1,7 +1,7 @@ import nlu from nlu.discovery import Discoverer -from nlu.pipe.utils.storage_ref_utils import StorageRefUtils -from typing import List, Tuple, Optional, Dict, Union +from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils +from typing import List, Tuple, Optional import streamlit as st from nlu.utils.modelhub.modelhub_utils import ModelHubUtils @@ -10,7 +10,6 @@ from nlu.pipe.viz.streamlit_viz.streamlit_utils_OS import StreamlitUtilsOS from nlu.pipe.viz.streamlit_viz.gen_streamlit_code import get_code_for_viz from nlu.pipe.viz.streamlit_viz.styles import _set_block_container_style -import random from nlu.pipe.viz.streamlit_viz.streamlit_viz_tracker import StreamlitVizTracker diff --git a/nlu/spellbook.py b/nlu/spellbook.py index b3a3ced0..20a69281 100644 --- 
a/nlu/spellbook.py +++ b/nlu/spellbook.py @@ -1,3 +1,6 @@ +import sparknlp + + class Spellbook(): # NLU model_base_names = # These reference tell NLU to which component resolved to route a request, they help NLU map a NLP reference to the correct class @@ -5,15 +8,15 @@ class Spellbook(): 'tfhub_use', 'distil', 'sentence_xlm.py', 'roberta', 'longformer', 'token_bert', 'token_distilbert' ] - sentence_embeddings = ['embed_sentence', 'use', 'bert', 'electra', 'tfhub_use'] + sentence_embeddings = ['embed_sentence', 'use', 'bert', 'electra', 'tfhub_use', 'doc2vec'] classifiers = ['classify', 'e2e', 'emotion', 'sentiment', 'ner', 'pos', 'trec6', 'trec50', 'questions', 'sarcasm', 'emotion', 'spam', 'fakenews', 'cyberbullying', 'wiki', 'wiki_7', 'wiki_20', 'yake', 'toxic', - 'assert', 'med_ner' + 'assert', 'med_ner', 'bert_sequence', 'distilbert_sequence' ] - token_classifiers = ['classify_token',] # TODO + token_classifiers = ['classify_token', ] # TODO seq2seq = ['t5', 'marian', 'translate_to'] actions = ['tokenize', 'sentence', 'embed', 'embed_sentence', 'embed_chunk', 'classify', 'chunk', 'pos', 'ner', 'dep', 'dep.untyped', 'lemma', 'match', 'norm', 'spell', 'stem', 'stopwords', 'clean', 'ngram', @@ -1816,7 +1819,7 @@ class Spellbook(): 'nl.ner.wikiner.glove.840B_300': 'wikiner_840B_300', 'nl.embed.bert': 'bert_base_dutch_cased', 'nl.embed_sentence.bert.base_cased': 'sent_bert_base_cased', - 'nl.embed.bert.base_cased' : 'bert_base_cased', + 'nl.embed.bert.base_cased': 'bert_base_cased', }, 'en': { @@ -2097,7 +2100,7 @@ class Spellbook(): 'en.ner.conll_longformer_large_4096': 'ner_conll_longformer_large_4096', # Spark NLP 3.2.2 NerDLModel - 'en.ner.conll_elmo': 'ner_conll_elmo', + # 'en.ner.conll_elmo': 'ner_conll_elmo', # broken 'en.ner.conll_albert_base_uncased': 'ner_conll_albert_base_uncased', 'en.ner.conll_albert_large_uncased': 'ner_conll_albert_large_uncased', 'en.ner.conll_xlnet_base_cased': 'ner_conll_xlnet_base_cased', @@ -2121,26 +2124,47 @@ class Spellbook(): 'en.embed_sentence.bert.wiki_books': 'sent_bert_wiki_books', 'en.embed_sentence.bert.pubmed_squad2': 'sent_bert_pubmed_squad2', 'en.embed_sentence.bert.pubmed': 'sent_bert_pubmed', - 'en.embed.bert.base_uncased_legal' : 'bert_base_uncased_legal', + 'en.embed.bert.base_uncased_legal': 'bert_base_uncased_legal', 'en.embed_sentence.bert.base_uncased_legal': 'sent_bert_base_uncased_legal', - 'en.classify.token_bert.classifier_ner_btc' :'bert_token_classifier_ner_btc', - - + 'en.classify.token_bert.classifier_ner_btc': 'bert_token_classifier_ner_btc', # Spark NLP 3.3.0 - 'en.classify.token_roberta_large_token_classifier_conll03' :'roberta_large_token_classifier_conll03', - 'en.classify.token_roberta_base_token_classifier_ontonotes' :'roberta_base_token_classifier_ontonotes', - 'en.classify.token_roberta_base_token_classifier_conll03' :'roberta_base_token_classifier_conll03', - 'en.classify.token_distilroberta_base_token_classifier_ontonotes' :'distilroberta_base_token_classifier_ontonotes', - 'en.classify.token_albert_large_token_classifier_conll03' :'albert_large_token_classifier_conll03', - 'en.classify.token_albert_base_token_classifier_conll03' :'albert_base_token_classifier_conll03', - 'en.classify.token_xlnet_base_token_classifier_conll03' :'xlnet_base_token_classifier_conll03', - - 'en.classify.token_roberta.large_token_classifier_ontonotes' : 'roberta_large_token_classifier_ontonotes', - 'en.classify.token_albert.xlarge_token_classifier_conll03' : 'albert_xlarge_token_classifier_conll03', - 
'en.classify.token_xlnet.large_token_classifier_conll03' : 'xlnet_large_token_classifier_conll03', - 'en.classify.token_longformer.base_token_classifier_conll03' : 'longformer_base_token_classifier_conll03', - + 'en.classify.token_roberta_large_token_classifier_conll03': 'roberta_large_token_classifier_conll03', + 'en.classify.token_roberta_base_token_classifier_ontonotes': 'roberta_base_token_classifier_ontonotes', + 'en.classify.token_roberta_base_token_classifier_conll03': 'roberta_base_token_classifier_conll03', + 'en.classify.token_distilroberta_base_token_classifier_ontonotes': 'distilroberta_base_token_classifier_ontonotes', + 'en.classify.token_albert_large_token_classifier_conll03': 'albert_large_token_classifier_conll03', + 'en.classify.token_albert_base_token_classifier_conll03': 'albert_base_token_classifier_conll03', + 'en.classify.token_xlnet_base_token_classifier_conll03': 'xlnet_base_token_classifier_conll03', + + 'en.classify.token_roberta.large_token_classifier_ontonotes': 'roberta_large_token_classifier_ontonotes', + 'en.classify.token_albert.xlarge_token_classifier_conll03': 'albert_xlarge_token_classifier_conll03', + 'en.classify.token_xlnet.large_token_classifier_conll03': 'xlnet_large_token_classifier_conll03', + 'en.classify.token_longformer.base_token_classifier_conll03': 'longformer_base_token_classifier_conll03', + + # Spark NLP 3.3.2 + + 'en.classify.bert_sequence.imdb_large': 'bert_large_sequence_classifier_imdb', + 'en.classify.bert_sequence.imdb': 'bert_base_sequence_classifier_imdb', + 'en.classify.bert_sequence.ag_news': 'bert_base_sequence_classifier_ag_news', + 'en.classify.bert_sequence.dbpedia_14': 'bert_base_sequence_classifier_dbpedia_14', + 'en.classify.bert_sequence.finbert': 'bert_sequence_classifier_finbert', + 'en.classify.bert_sequence.dehatebert_mono': 'bert_sequence_classifier_dehatebert_mono', + + # Spark NLP 3.3.3 + 'en.classify.distilbert_sequence.sst2': 'distilbert_sequence_classifier_sst2', + 'en.classify.distilbert_sequence.policy': 'distilbert_sequence_classifier_policy', + 'en.classify.distilbert_sequence.industry': 'distilbert_sequence_classifier_industry', + 'en.classify.distilbert_sequence.emotion': 'distilbert_sequence_classifier_emotion', + 'en.classify.distilbert_sequence.banking77': 'distilbert_sequence_classifier_banking77', + + 'en.classify.distilbert_sequence.imdb': 'distilbert_base_sequence_classifier_imdb', + 'en.classify.distilbert_sequence.amazon_polarity': 'distilbert_base_sequence_classifier_amazon_polarity', + 'en.classify.distilbert_sequence.ag_news': 'distilbert_base_sequence_classifier_ag_news', + + 'en.embed_sentence.doc2vec': 'doc2vec_gigaword_300', + 'en.embed_sentence.doc2vec.gigaword_300': 'doc2vec_gigaword_300', + 'en.embed_sentence.doc2vec.gigaword_wiki_300': 'doc2vec_gigaword_wiki_300', }, @@ -2149,7 +2173,6 @@ class Spellbook(): }, - 'fr': { 'fr.lemma': 'lemma', 'fr.pos': 'pos_ud_gsd', # default pos fr @@ -2159,7 +2182,8 @@ class Spellbook(): 'fr.ner.wikiner.glove.840B_300': 'wikiner_840B_300', 'fr.stopwords': 'stopwords_fr', 'fr.ner.wikiner.glove.6B_300': 'wikiner_6B_300', - 'fr.classify.sentiment.bert' : 'classifierdl_bert_sentiment' + 'fr.classify.sentiment.bert': 'classifierdl_bert_sentiment', + 'fr.classify.distilbert_sequence.allocine': 'distilbert_multilingual_sequence_classifier_allocine', }, 'de': { @@ -2176,15 +2200,14 @@ class Spellbook(): 'de.embed.bert.uncased': 'bert_base_german_uncased', 'de.classify.news': 'classifierdl_bert_news', 'de.embed_sentence.bert.base_cased': 'sent_bert_base_cased', 
- 'de.classify.sentiment.bert' : 'classifierdl_bert_sentiment' - + 'de.classify.sentiment.bert': 'classifierdl_bert_sentiment' }, 'it': { 'it.lemma': 'lemma_dxc', # default lemma it 'it.lemma.dxc': 'lemma_dxc', - 'it.sentiment.dxc': 'sentiment_dxc', - 'it.sentiment': 'sentiment_dxc', # default sentiment it + # 'it.sentiment.dxc': 'sentiment_dxc', # Deprecated + # 'it.sentiment': 'sentiment_dxc', # Deprecated 'it.pos': 'pos_ud_isdt', # default pos it 'it.pos.ud_isdt': 'pos_ud_isdt', 'it.ner': 'wikiner_840B_300', # default ner it @@ -2196,12 +2219,13 @@ class Spellbook(): }, 'nb': { 'nb.lemma': 'lemma', - 'nb.pos.ud_bokmaal': 'pos_ud_bokmaal', + # 'nb.pos.ud_bokmaal': 'pos_ud_bokmaal', # Deprecated }, 'no': { - 'no.ner': 'norne_6B_100', # ner default no - 'no.ner.norne': 'norne_6B_100', # ner default no embeds + # Missing 'no' embedding defaults + 'no.ner.norne.100d': 'norne_6B_100', # ner default no + 'no.ner.norne': 'norne_6B_100', 'no.ner.norne.glove.6B_100': 'norne_6B_100', 'no.ner.norne.glove.6B_300': 'norne_6B_300', 'no.ner.norne.glove.840B_300': 'norne_840B_300', @@ -2251,17 +2275,15 @@ class Spellbook(): }, - 'pa' : { + 'pa': { 'pa.detect_sentence': 'sentence_detector_dl', }, - 'ne' : { + 'ne': { 'ne.detect_sentence': 'sentence_detector_dl', }, - - 'es': { 'es.lemma': 'lemma', 'es.pos': 'pos_ud_gsd', # pos default es @@ -2273,12 +2295,11 @@ class Spellbook(): 'es.ner.wikiner.glove.840B_300': 'wikiner_840B_300', 'es.stopwords_es': 'stopwords_es', 'es.classify.token_bert.spanish_ner': 'bert_token_classifier_spanish_ner', - 'es.embed.bert.base_uncased' : 'bert_base_uncased', - 'es.embed.bert.base_cased' : 'bert_base_cased', + 'es.embed.bert.base_uncased': 'bert_base_uncased', + 'es.embed.bert.base_cased': 'bert_base_cased', 'es.embed_sentence.bert.base_uncased': 'sent_bert_base_uncased', 'es.embed_sentence.bert.base_cased': 'sent_bert_base_cased', - }, 'af': { 'af.stopwords': 'stopwords_af', @@ -2364,9 +2385,9 @@ class Spellbook(): 'fi.embed.bert': 'bert_finnish_cased', 'fi.embed.bert.cased': 'bert_finnish_cased', 'fi.embed.bert.uncased': 'bert_finnish_uncased', - 'fi.embed_sentence': 'sent_bert_finnish_cased', - 'fi.embed_sentence.bert.cased': 'sent_bert_finnish_cased', - 'fi.embed_sentence.bert.uncased': 'sent_bert_finnish_uncased', + # 'fi.embed_sentence': 'sent_bert_finnish_cased', # Broken bad TF Graph + # 'fi.embed_sentence.bert.cased': 'sent_bert_finnish_cased', # Broken bad TF Graph + # 'fi.embed_sentence.bert.uncased': 'sent_bert_finnish_uncased', # Broken bad TF Graph 'fi.ner.6B_100d': 'finnish_ner_6B_100', 'fi.ner.6B_300d': 'finnish_ner_6B_300', 'fi.ner.840B_300d': 'finnish_ner_840B_300', @@ -2384,15 +2405,14 @@ class Spellbook(): 'el.pos.ud_gdt': 'pos_ud_gdt', 'el.stopwords': 'stopwords_el', - 'el.embed.bert.base_uncased' : 'bert_base_uncased', + 'el.embed.bert.base_uncased': 'bert_base_uncased', 'el.embed_sentence.bert.base_uncased': 'sent_bert_base_uncased', }, 'ha': { 'ha.stopwords': 'stopwords_ha', - 'ha.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_hausa', - 'ha.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_hausa', #Default Sentence XLM - + 'ha.embed.xlm_roberta': 'xlm_roberta_base_finetuned_hausa', + 'ha.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_hausa', # Default Sentence XLM }, 'he': { @@ -2425,13 +2445,13 @@ class Spellbook(): 'id.stopwords': 'stopwords_id', 'id.lemma': 'lemma', 'id.pos': 'pos_ud_gsd', + 'id.detect_sentence': 'sentence_detector_dl', }, 'ga': { 'ga.stopwords': 'stopwords_ga', 'ga.lemma': 'lemma', 'ga.pos': 'pos_ud_idt', -
'id.detect_sentence': 'sentence_detector_dl', }, 'da': { @@ -2452,10 +2472,10 @@ class Spellbook(): 'ja.ner.ud_gsd.glove_840B_300D': 'ner_ud_gsd_glove_840B_300d', 'ja.pos.ud_gsd': 'pos_ud_gsd', "ja.lemma": "lemma", - 'ja.embed.glove.cc_300d' : 'japanese_cc_300d', - 'ja.ner.ud_gsd_cc_300d' : 'ner_ud_gsd_cc_300d', - 'ja.ner.ud_gsd_xlm_roberta_base' : 'ner_ud_gsd_xlm_roberta_base', - 'ja.classify.token_bert.classifier_ner_ud_gsd' :'bert_token_classifier_ner_ud_gsd', + 'ja.embed.glove.cc_300d': 'japanese_cc_300d', + 'ja.ner.ud_gsd_cc_300d': 'ner_ud_gsd_cc_300d', + 'ja.ner.ud_gsd_xlm_roberta_base': 'ner_ud_gsd_xlm_roberta_base', + 'ja.classify.token_bert.classifier_ner_ud_gsd': 'bert_token_classifier_ner_ud_gsd', }, 'la': { 'la.stopwords': 'stopwords_la', @@ -2501,7 +2521,7 @@ class Spellbook(): 'fa.classify.token_bert.parsbert_armanner': 'bert_token_classifier_parsbert_armanner', 'fa.classify.token_bert.parsbert_ner': 'bert_token_classifier_parsbert_ner', 'fa.classify.token_bert.parsbert_peymaner': 'bert_token_classifier_parsbert_peymaner', - 'fa.classify.token_roberta_token_classifier_zwnj_base_ner' :'roberta_token_classifier_zwnj_base_ner', + 'fa.classify.token_roberta_token_classifier_zwnj_base_ner': 'roberta_token_classifier_zwnj_base_ner', }, 'ro': { @@ -2533,16 +2553,13 @@ class Spellbook(): }, - - - 'st': { 'st.stopwords': 'stopwords_st' }, 'sw': { 'sw.stopwords': 'stopwords_sw', - 'sw.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_swahili', #Default Sentence XLM - 'sw.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_swahili', # Default XLM Word Embd + 'sw.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_swahili', # Default Sentence XLM + 'sw.embed.xlm_roberta': 'xlm_roberta_base_finetuned_swahili', # Default XLM Word Embd }, 'sv': { @@ -2555,8 +2572,8 @@ class Spellbook(): 'sv.ner.6B_300': 'swedish_ner_6B_300', 'sv.ner.840B_300': 'swedish_ner_840B_300', 'sv.classify.token_bert.swedish_ner': 'bert_token_classifier_swedish_ner', - 'sv.embed.bert.base_cased' : 'bert_base_cased', - 'sv.embed_sentence.bert.base_cased':'sent_bert_base_cased', + 'sv.embed.bert.base_cased': 'bert_base_cased', + 'sv.embed_sentence.bert.base_cased': 'sent_bert_base_cased', }, 'th': { @@ -2589,63 +2606,51 @@ class Spellbook(): }, - 'te': { 'te.detect_sentence': 'sentence_detector_dl', }, - 'yo': { 'yo.stopwords': 'stopwords_yo', 'yo.lemma': 'lemma', 'yo.pos': 'pos_ud_ytb', - 'yo.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_yoruba', #Default Sentence XLM - 'yo.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_yoruba', # Default XLM Word Embd - - + 'yo.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_yoruba', # Default Sentence XLM + 'yo.embed.xlm_roberta': 'xlm_roberta_base_finetuned_yoruba', # Default XLM Word Embd }, 'ig': { - 'ig.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_igbo', - 'ig.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_igbo', #Default Sentence XLM - }, + 'ig.embed.xlm_roberta': 'xlm_roberta_base_finetuned_igbo', + 'ig.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_igbo', # Default Sentence XLM + }, 'lg': { - 'lg.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_luganda', - 'lg.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_luganda', #Default Sentence XLM + 'lg.embed.xlm_roberta': 'xlm_roberta_base_finetuned_luganda', + 'lg.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_luganda', # Default Sentence XLM }, - 'lou': { - 'lou.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_luo', 
# Default XLM Word Embd + 'lou.embed.xlm_roberta': 'xlm_roberta_base_finetuned_luo', # Default XLM Word Embd }, 'pcm': { - 'pcm.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_naija', # Default XLM Word Embd - 'pcm.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_naija', #Default Sentence XLM + 'pcm.embed.xlm_roberta': 'xlm_roberta_base_finetuned_naija', # Default XLM Word Embd + 'pcm.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_naija', # Default Sentence XLM }, 'wo': { - 'wo.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_wolof', #Default Sentence XLM - 'wo.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_wolof', # Default XLM Word Embd + 'wo.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_wolof', # Default Sentence XLM + 'wo.embed.xlm_roberta': 'xlm_roberta_base_finetuned_wolof', # Default XLM Word Embd }, 'rw': { - 'rw.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_kinyarwanda', #Default Sentence XLM - 'rw.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_kinyarwanda', + 'rw.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_kinyarwanda', # Default Sentence XLM + 'rw.embed.xlm_roberta': 'xlm_roberta_base_finetuned_kinyarwanda', }, - - - - - - - 'zu': { 'zu.stopwords': 'stopwords_zu' }, @@ -2663,7 +2668,7 @@ class Spellbook(): 'zh.pos': 'pos_ud_gsd', # default zh pos, 'zh.pos.ud_gsd': 'pos_ud_gsd', - 'zh.pos.ctb9': 'pos_ctb9', + # 'zh.pos.ctb9': 'pos_ctb9', "zh.pos.ud_gsd_trad": "pos_ud_gsd_trad", 'zh.ner': 'ner_msra_bert_768d', # default zh ner, @@ -2681,7 +2686,7 @@ class Spellbook(): 'et': { 'et.lemma': 'lemma', - 'et.pos': 'pos_ud_edt', + # 'et.pos': 'pos_ud_edt', }, 'ur': { 'ur.lemma': 'lemma', @@ -2694,15 +2699,16 @@ class Spellbook(): 'ur.embed.urdu_vec_140M_300d': 'urduvec_140M_300d', 'ur.ner': 'uner_mk_140M_300d', 'ur.ner.mk_140M_300d': 'uner_mk_140M_300d', + 'ur.classify.distilbert_sequence.imdb': 'distilbert_base_sequence_classifier_imdb', }, 'ko': { 'ko.segment_words': 'wordseg_kaist_ud', - 'ko.pos': 'pos_ud_kaist', # default ko pos + # 'ko.pos': 'pos_ud_kaist', # default ko pos 'ko.ner': 'ner_kmou_glove_840B_300d', # default ko ner - 'ko.pos.ud_kaist': 'pos_ud_kaist', + # 'ko.pos.ud_kaist': 'pos_ud_kaist', 'ko.ner.kmou': 'ner_kmou_glove_840B_300d', # default ner kmou 'ko.ner.kmou.glove_840B_300d': 'ner_kmou_glove_840B_300d', "ko.lemma": "lemma", @@ -2716,15 +2722,15 @@ class Spellbook(): 'am': { "am.pos": "pos_ud_att", "am.lemma": "lemma", - 'am.embed.xlm_roberta' : 'xlm_roberta_base_finetuned_amharic', - 'am.embed_sentence.xlm_roberta' : 'sent_xlm_roberta_base_finetuned_amharic', #Default Sentence XLM + 'am.embed.xlm_roberta': 'xlm_roberta_base_finetuned_amharic', + 'am.embed_sentence.xlm_roberta': 'sent_xlm_roberta_base_finetuned_amharic', # Default Sentence XLM }, 'xx': { # NLU 3.3.0 - 'xx.embed.xlm_roberta_large' : 'xlm_roberta_large', - 'xx.classify.token_xlm_roberta.token_classifier_ner_40_lang' : 'xlm_roberta_token_classifier_ner_40_lang', + 'xx.embed.xlm_roberta_large': 'xlm_roberta_large', + 'xx.classify.token_xlm_roberta.token_classifier_ner_40_lang': 'xlm_roberta_token_classifier_ner_40_lang', # Spark NLP 3.2.2 'xx.embed.bert.muril': 'bert_muril', @@ -2753,8 +2759,8 @@ class Spellbook(): 'xx.embed.bert_multi_cased': 'bert_multi_cased', 'xx.embed.bert': 'bert_multi_cased', - 'xx.classify.wiki_7': 'ld_wiki_7', - 'xx.classify.wiki_20': 'ld_wiki_20', + # 'xx.classify.wiki_7': 'ld_wiki_7', # Deprecated + # 'xx.classify.wiki_20': 'ld_wiki_20', # Deprecated 'xx.classify.wiki_21':
'ld_wiki_tatoeba_cnn_21', 'xx.classify.wiki_21.bigru': 'ld_tatoeba_bigru_21', 'xx.classify.wiki_99': 'ld_tatoeba_cnn_99', @@ -2779,10 +2785,10 @@ class Spellbook(): # 2.7 'xx.use.multi': 'tfhub_use_multi', 'xx.use.multi_lg': 'tfhub_use_multi_lg', - 'xx.use.xling_en_de': 'tfhub_use_xling_en_de', - 'xx.use.xling_en_es': 'tfhub_use_xling_en_es', - 'xx.use.xling_en_fr': 'tfhub_use_xling_en_fr', - 'xx.use.xling_many': 'tfhub_use_xling_many', + # 'xx.use.xling_en_de': 'tfhub_use_xling_en_de', + # 'xx.use.xling_en_es': 'tfhub_use_xling_en_es', + # 'xx.use.xling_en_fr': 'tfhub_use_xling_en_fr', + # 'xx.use.xling_many': 'tfhub_use_xling_many', 'xx.use.multi' # 2.7.0 marian translate model references @@ -4126,22 +4132,19 @@ class Spellbook(): # 'en.spell.context.med':'context_spell_med', # TODO crashing ByteError # 3.3.0 Healthcare - 'en.spell.drug_norvig':'spellcheck_drug_norvig', + 'en.spell.drug_norvig': 'spellcheck_drug_norvig', 'en.classify.token_bert.ner_bacteria': 'bert_token_classifier_ner_bacteria', 'en.classify.token_bert.ner_anatomy': 'bert_token_classifier_ner_anatomy', 'en.classify.token_bert.ner_drugs': 'bert_token_classifier_ner_drugs', 'en.classify.token_bert.ner_jsl_slim': 'bert_token_classifier_ner_jsl_slim', 'en.classify.token_bert.ner_ade': 'bert_token_classifier_ner_ade', - 'en.resolve.rxnorm_ndc' : 'sbiobertresolve_rxnorm_ndc', - 'en.resolve.icd10cm_generalised' : 'sbiobertresolve_icd10cm_generalised', - 'en.resolve.hcpcs' : 'sbiobertresolve_hcpcs', - 'en.med_ner.chexpert' : 'ner_chexpert', - - + 'en.resolve.rxnorm_ndc': 'sbiobertresolve_rxnorm_ndc', + 'en.resolve.icd10cm_generalised': 'sbiobertresolve_icd10cm_generalised', + 'en.resolve.hcpcs': 'sbiobertresolve_hcpcs', + 'en.med_ner.chexpert': 'ner_chexpert', # 3.2.3 Healthcare - 'en.classify.token_bert.ner_deid' : 'bert_token_classifier_ner_deid', - + 'en.classify.token_bert.ner_deid': 'bert_token_classifier_ner_deid', 'en.detect_sentence.clinical': 'sentence_detector_dl_healthcare', 'en.norm_drugs': 'drug_normalizer', # TODO!?!??! @@ -4360,7 +4363,7 @@ class Spellbook(): 'en.med_ner.ade_biobert': 'ner_ade_biobert', 'en.classify.ade.clinicalbert': 'classifierdl_ade_clinicalbert', - 'en.med_ner.large': 'ner_large_clinical', + # 'en.med_ner.large': 'ner_large_clinical', # Deprecated # DeIdentificationModel @@ -4372,7 +4375,6 @@ class Spellbook(): 'en.de_identify.rb': 'deidentify_rb', 'en.de_identify.rb_no_regex': 'deidentify_rb_no_regex', - # 'en.classify.icd10.clinical':'classifier_icd10cm_hcc_clinical', # WHICH CLASS?
# TODO NOT LOADING # 'en.classify.icd10.healthcare':'classifier_icd10cm_hcc_healthcare', # TODO NOT LOADING CORRECTLY 'en.classify.ade.biobert': 'classifierdl_ade_biobert', @@ -4402,7 +4404,7 @@ class Spellbook(): 'en.resolve.rxnorm_disposition': 'sbiobertresolve_rxnorm_disposition', 'en.resolve.rxnorm_disposition.sbert': 'sbertresolve_rxnorm_disposition', - 'en.resolve.biobert_base_cased_mli': 'sbiobert_base_cased_mli', + # 'en.resolve.biobert_base_cased_mli': 'sbiobert_base_cased_mli', 'en.med_ner.jsl_slim': 'ner_jsl_slim', 'en.med_ner.jsl_greedy_biobert': 'ner_jsl_greedy_biobert', @@ -4410,10 +4412,30 @@ class Spellbook(): 'en.classify.token_bert.ner_jsl': 'bert_token_classifier_ner_jsl', # 3.3.1 healthcare - 'en.classify.token_bert.ner_chemical':'bert_token_classifier_ner_chemicals', - 'en.resolve.umls_disease_syndrome' : 'sbiobertresolve_umls_disease_syndrome', - 'en.resolve.umls_clinical_drugs' : 'sbiobertresolve_umls_clinical_drugs', - + 'en.classify.token_bert.ner_chemical': 'bert_token_classifier_ner_chemicals', + + ## 3.3.3 HC + 'en.med_ner.deid_subentity_augmented_i2b2': 'ner_deid_subentity_augmented_i2b2', + 'en.med_ner.biomarker': 'ner_biomarker', + 'en.med_ner.nihss': 'ner_nihss', + 'en.extract_relation.nihss': 'redl_nihss_biobert', + 'en.resolve.mesh': 'sbiobertresolve_mesh', + 'en.resolve.mli': 'sbiobert_base_cased_mli', + 'en.resolve.ndc': 'sbiobertresolve_ndc', + 'en.resolve.loinc.augmented': 'sbiobertresolve_loinc_augmented', + 'en.resolve.clinical_snomed_procedures_measurements': 'sbiobertresolve_clinical_snomed_procedures_measurements', + 'es.embed.roberta_base_biomedical': 'roberta_base_biomedical', + 'es.med_ner.roberta_ner_diag_proc': 'roberta_ner_diag_proc', + 'es.resolve.snomed': 'robertaresolve_snomed', + 'en.med_ner.covid_trials': 'ner_covid_trials', + 'en.med_ner.chemd': 'ner_chemd_clinical', + 'en.classify.token_bert.bionlp': 'bert_token_classifier_ner_bionlp', + 'en.classify.token_bert.cellular': 'bert_token_classifier_ner_cellular', + 'en.classify.token_bert.chemicals': 'bert_token_classifier_ner_chemicals', + 'en.resolve.rxnorm_augmented': 'sbiobertresolve_rxnorm_augmented', + 'en.resolve.umls_disease_syndrome': 'sbiobertresolve_umls_disease_syndrome', + 'en.resolve.umls_clinical_drugs': 'sbiobertresolve_umls_clinical_drugs', + 'en.classify.bert_sequence.question_statement_clinical': 'bert_sequence_classifier_question_statement_clinical', }, @@ -4426,8 +4448,8 @@ class Spellbook(): # 'de.med_ner.healthcare' :'ner_healthcare', # BAD NER TRAINED ON STORAGE_REF embeddings_healthcare_100d which only exists in EN 'de.med_ner': 'ner_healthcare_slim', 'de.med_ner.traffic': 'ner_traffic', - 'de.resolve.icd10gm' :'sbertresolve_icd10gm', - 'de.resolve.snomed' :'sbertresolve_snomed', + 'de.resolve.icd10gm': 'sbertresolve_icd10gm', + 'de.resolve.snomed': 'sbertresolve_snomed', }, @@ -4444,13 +4466,16 @@ class Spellbook(): 'es.med_ner': 'ner_diag_proc', 'es.med_ner.neoplasm': 'ner_neoplasms', 'es.med_ner.diag_proc': 'ner_diag_proc', + + 'es.embed.roberta_base_biomedical': 'roberta_base_biomedical', + 'es.med_ner.roberta_ner_diag_proc': 'roberta_ner_diag_proc', + 'es.resolve.snomed': 'robertaresolve_snomed', } } pretrained_healthcare_pipe_references = { 'en': { - - 'en.med_ner.profiling_clinical' : 'ner_profiling_clinical', - 'en.med_ner.profiling_biobert' : 'ner_profiling_biobert', + 'en.med_ner.profiling_clinical': 'ner_profiling_clinical', + 'en.med_ner.profiling_biobert': 'ner_profiling_biobert', 'en.resolve.icd10cm.umls': 'icd10cm_umls_mapping', 'en.resolve.mesh.umls':
'mesh_umls_mapping', 'en.resolve.rxnorm.umls': 'rxnorm_umls_mapping', @@ -4472,68 +4497,88 @@ class Spellbook(): } # map storage ref to nlu ref storage_ref_2_nlu_ref = { - - 'en': { - 'tfhub_use': 'en.embed_sentence.use', - 'glove_100d': 'en.embed.glove.100d', - 'xlm_roberta_base': 'xx.embed.xlm' - }, - 'zh': { - 'bert_base_chinese': 'zh.embed', - }, - 'th': { - 'tfhub_use_multi_lg': 'xx.use.multi' - - }, - 'ur': { - 'glove_300d': 'ur.embed', - }, - 'fr': - { - 'labse': 'xx.embed_sentence.labse' + + 'en': { + 'tfhub_use': 'en.embed_sentence.use', + 'glove_100d': 'en.embed.glove.100d', + 'xlm_roberta_base': 'xx.embed.xlm' }, - 'tr': - {'bert_multi_cased': 'xx.embed.bert', - 'labse': 'xx.embed_sentence.labse' - }, - 'sv': - {'glove_100d': 'xx.embed.glove.glove_6B_100', - }, - 'fa': - {'glove_300d': 'fa.embed', - }, - 'he': - {'glove_300d': 'he.embed', - }, - 'fi': - {'glove_100d': 'fi.embed.bert', - }, - 'ar': - {'glove_300d': 'ar.embed', - }, - 'de': - { - 'sent_bert_multi_cased': 'xx.embed_sentence', - 'labse': 'xx.embed_sentence.labse', - 'clinical' : 'de.embed_sentence.bert.base_cased', + 'zh': { + 'bert_base_chinese': 'zh.embed', + }, + 'th': { + 'tfhub_use_multi_lg': 'xx.use.multi' }, - 'ja': - { - 'fasttext_300_ja': 'ja.embed.glove.cc_300d', - 'xlm_roberta_base': 'xx.embed.xlm', + 'ur': { + 'glove_300d': 'ur.embed', + }, + 'fr': + { + 'labse': 'xx.embed_sentence.labse' + }, + 'tr': + {'bert_multi_cased': 'xx.embed.bert', + 'labse': 'xx.embed_sentence.labse' + }, + 'sv': + {'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + 'fa': + {'glove_300d': 'fa.embed', + }, + 'he': + {'glove_300d': 'he.embed', + }, + 'fi': + { - } + 'glove_100d': 'xx.embed.glove.glove_6B_100', + + }, + 'ar': + {'glove_300d': 'ar.embed', + }, + 'de': + { + 'sent_bert_multi_cased': 'xx.embed_sentence', + 'labse': 'xx.embed_sentence.labse', + 'clinical': 'de.embed_sentence.bert.base_cased', + + }, + 'ja': + { + 'fasttext_300_ja': 'ja.embed.glove.cc_300d', + 'xlm_roberta_base': 'xx.embed.xlm', + }, + 'no': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + 'pl': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + 'pt': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + 'ru': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + 'es': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, -} # + 'da': { + 'glove_100d': 'xx.embed.glove.glove_6B_100', + }, + } # licensed_storage_ref_2_nlu_ref = { 'en': { 'clinical': 'en.embed.glove.clinical', 'biobert_pubmed_base_cased': 'biobert', - + 'ROBERTA_EMBEDDINGS_39f3e48e5c3f': 'en.embed_sentence.biobert.clinical_base_cased', # 'embeddings_healthcare100' : 'en.embed.glove.clinical', 'embeddings_healthcare100': 'en.embed.glove.healthcare_100d', 'BERT_SENTENCE_EMBEDDINGS_0bee53f1b2cc': 'en.embed_sentence.biobert.mli', @@ -4552,7 +4597,13 @@ class Spellbook(): }, 'es': { 'embeddings_scielowiki300': 'es.embed.scielowiki.300d', + 'ROBERTA_EMBEDDINGS_39f3e48e5c3f': 'es.embed.roberta_base_biomedical', + 'clinical': 'es.embed.roberta_base_biomedical', + # 'clinical' : 'es.embed.roberta_base_biomedical', - } + }, + 'de': { + 'clinical' : 'de.embed_sentence.bert.base_cased', + } } diff --git a/nlu/utils/environment/offline_load_utils.py b/nlu/utils/environment/offline_load_utils.py index 40b540bb..708019e4 100644 --- a/nlu/utils/environment/offline_load_utils.py +++ b/nlu/utils/environment/offline_load_utils.py @@ -1,10 +1,10 @@ import os,sys, json import nlu -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent # 
from nlu.pipe.pipe_logic import PipelineQueryVerifier from nlu.pipe.pipeline import * -from nlu.pipe.pipe_components import SparkNLUComponent +from nlu.pipe.pipe_component import SparkNLUComponent from pyspark.ml import PipelineModel from sparknlp.annotator import * diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 7a26e29c..155eef0b 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,8 +1,8 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="3.3.1" -PYSPARK="3.1.2" -NLU="3.3.0" +SPARKNLP="3.3.4" +PYSPARK="3.0.3" +NLU="3.3.1" SPARKHOME="/content/spark-3.1.1-bin-hadoop2.7" while getopts s:p: option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index c3a06dfc..cbb5ffc7 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,9 +1,9 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="3.3.1" +SPARKNLP="3.3.4" PYSPARK="3.0.3" -NLU="3.3.0" -SPARKHOME="spark-3.1.1-bin-hadoop2.7" +NLU="3.3.1" +SPARKHOME="spark-3.0.3-bin-hadoop2.7" while getopts s:p: option do
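For context on what the spellbook changes above do at runtime, here is a minimal usage sketch. It is an illustration, not part of the patch: it assumes the patched NLU 3.3.1 with the Spark NLP 3.3.4 dependency installed (e.g. via the setup scripts above) and network access so models can be downloaded on first load. The references are taken directly from the mappings added in this diff; the example input strings are placeholders, and the exact columns of the returned DataFrame depend on the model.

```python
import nlu
from nlu.spellbook import Spellbook

# Each spellbook entry maps an NLU reference to the name of a pretrained
# Spark NLP model; nlu.load() performs that lookup internally. For example,
# the SST-2 DistilBERT sequence classifier added in this patch resolves to
# 'distilbert_sequence_classifier_sst2':
pipe = nlu.load('en.classify.distilbert_sequence.sst2')
print(pipe.predict('NLU turns pretrained models into one-liners.'))

# The new doc2vec sentence-embedding entry resolves to 'doc2vec_gigaword_300':
embeddings = nlu.load('en.embed_sentence.doc2vec').predict('One line of code.')

# storage_ref_2_nlu_ref maps the other way: from a model's storage ref (its
# embedding dependency) to the NLU reference that provides those embeddings.
print(Spellbook.storage_ref_2_nlu_ref['en']['glove_100d'])  # 'en.embed.glove.100d'
```

Note that references commented out in this patch as deprecated or broken (e.g. `fi.embed_sentence`, `xx.classify.wiki_7`) will no longer resolve once it is applied.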