We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
2024-10-11 10:15:17.627 | INFO | bertrend.train:train_BERTopic:486 - Reducing outliers via embeddings strategy... 2024-10-11 10:15:17,720 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings. ╭──────────────────────────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────────────────────────╮ │ /home/dev-dsia/dev/BERTrend/bertrend_apps/newsletters/main.py:131 in newsletter_from_feed │ │ │ │ 128 │ │ │ │ 129 │ │ else: │ │ 130 │ │ │ # learn and predict │ │ ❱ 131 │ │ │ topics, topic_model = _train_topic_model(config, dataset) │ │ 132 │ │ │ # train topic model with the dataset │ │ 133 │ │ │ if model_path: │ │ 134 │ │ │ │ logger.info(f"Saving topic model to: {model_path}") │ │ │ │ ╭─────────────────────────────────────────────────────────────── locals ───────────────────────────────────────────────────────────────╮ │ │ │ config = <configparser.ConfigParser object at 0x7f28b482f210> │ │ │ │ data_feed_cfg = <configparser.ConfigParser object at 0x7f28b482e910> │ │ │ │ data_feed_cfg_path = PosixPath('/home/dev-dsia/dev/BERTrend/bertrend_apps/config/feeds/llm_en_feed.cfg') │ │ │ │ dataset = │ index ... timestamp │ │ │ │ 0 0 ... 2024-10-09 22:40:00 │ │ │ │ 1 1 ... 2024-10-09 22:40:00 │ │ │ │ 2 2 ... 2024-10-09 22:40:00 │ │ │ │ 3 3 ... 2024-10-09 22:40:00 │ │ │ │ 4 4 ... 2024-10-09 22:40:00 │ │ │ │ ... ... ... ... │ │ │ │ 4751 4751 ... 2024-09-26 07:00:00 │ │ │ │ 4752 4752 ... 2024-09-26 07:00:00 │ │ │ │ 4753 4753 ... 2024-09-26 07:00:00 │ │ │ │ 4754 4754 ... 2024-09-26 07:00:00 │ │ │ │ 4755 4755 ... 
2024-09-26 07:00:00 │ │ │ │ │ │ │ │ [4756 rows x 7 columns] │ │ │ │ learning_strategy = <Section: learning_strategy> │ │ │ │ learning_type = 'learn_from_last' │ │ │ │ model_path = PosixPath('/scratch/nlp/output/bertrend/models/llm_from_last_en') │ │ │ │ newsletter_cfg_path = PosixPath('/home/dev-dsia/dev/BERTrend/bertrend_apps/config/newsletters/llm_en_newsletter_from_last.cfg') │ │ │ │ newsletter_params = <Section: newsletter> │ │ │ │ original_dataset = │ index ... timestamp │ │ │ │ 0 0 ... 2024-10-06 09:05:37 │ │ │ │ 1 1 ... 2024-09-30 07:00:00 │ │ │ │ 2 2 ... 2024-09-30 14:55:27 │ │ │ │ 3 3 ... 2024-10-03 17:09:00 │ │ │ │ 4 4 ... 2024-10-06 07:00:00 │ │ │ │ .. ... ... ... │ │ │ │ 242 242 ... 2024-10-06 17:14:04 │ │ │ │ 243 243 ... 2024-10-08 15:00:00 │ │ │ │ 244 244 ... 2024-09-28 07:00:00 │ │ │ │ 245 245 ... 2024-09-28 07:00:00 │ │ │ │ 246 246 ... 2024-09-27 07:00:00 │ │ │ │ │ │ │ │ [247 rows x 7 columns] │ │ │ │ split_data_by_paragraphs = True │ │ │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ │ │ /home/dev-dsia/dev/BERTrend/bertrend_apps/newsletters/main.py:221 in _train_topic_model │ │ │ │ 218 │ │ if topic_params.get("nr_topics") == 0: │ │ 219 │ │ │ topic_params["nr_topics"] = None │ │ 220 │ │ │ │ ❱ 221 │ │ topic_model, topics, _, _, _, _ = train_BERTopic( │ │ 222 │ │ │ **topic_params, │ │ 223 │ │ │ full_dataset=dataset, │ │ 224 │ │ │ embedding_model_name=embedding_model_name, │ │ │ │ ╭──────────────────────────────────────────────────────────────────── locals ────────────────────────────────────────────────────────────────────╮ │ │ │ config = <configparser.ConfigParser object at 0x7f28b482f210> │ │ │ │ ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) │ │ │ │ dataset = │ index ... timestamp │ │ │ │ 0 0 ... 2024-10-09 22:40:00 │ │ │ │ 1 1 ... 2024-10-09 22:40:00 │ │ │ │ 2 2 ... 2024-10-09 22:40:00 │ │ │ │ 3 3 ... 2024-10-09 22:40:00 │ │ │ │ 4 4 ... 
2024-10-09 22:40:00 │ │ │ │ ... ... ... ... │ │ │ │ 4751 4751 ... 2024-09-26 07:00:00 │ │ │ │ 4752 4752 ... 2024-09-26 07:00:00 │ │ │ │ 4753 4753 ... 2024-09-26 07:00:00 │ │ │ │ 4754 4754 ... 2024-09-26 07:00:00 │ │ │ │ 4755 4755 ... 2024-09-26 07:00:00 │ │ │ │ │ │ │ │ [4756 rows x 7 columns] │ │ │ │ embedding_model_name = 'sentence-transformers/all-mpnet-base-v2' │ │ │ │ hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=10, prediction_data=True) │ │ │ │ topic_params = {'nr_topics': None, 'top_n_words': 5} │ │ │ │ umap_model = UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: │ │ │ │ {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}) │ │ │ │ vectorizer_model = CountVectorizer(min_df=2, │ │ │ │ │ │ │ │ stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', │ │ │ │ │ │ │ │ │ │ │ 'ourselves', 'you', "you're", "you've", "you'll", │ │ │ │ │ │ │ │ │ │ │ "you'd", 'your', 'yours', 'yourself', 'yourselves', │ │ │ │ │ │ │ │ │ │ │ 'he', 'him', 'his', 'himself', 'she', "she's", │ │ │ │ │ │ │ │ │ │ │ 'her', 'hers', 'herself', 'it', "it's", 'its', │ │ │ │ │ │ │ │ │ │ │ 'itself', ...]) │ │ │ ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ │ │ /home/dev-dsia/dev/BERTrend/bertrend/train.py:494 in train_BERTopic │ │ │ │ 491 │ │ strategy="embeddings", │ │ 492 │ ) │ │ 493 │ │ │ ❱ 494 │ topic_model.update_topics( │ │ 495 │ │ filtered_dataset[column], │ │ 496 │ │ topics=new_topics, │ │ 497 │ │ vectorizer_model=vectorizer_model, │ │
The text was updated successfully, but these errors were encountered:
Log complet : cron_newsletter_newsletter_decarbonation_from_last.log
Sorry, something went wrong.
A priori, ce problème est lié à un changement dans la gestion des outliers dans BERTopic > 0.6.2.
Workaround : imposer la version 0.6.2 de bertopic (bertopic=0.6.2).
No branches or pull requests
2024-10-11 10:15:17.627 | INFO | bertrend.train:train_BERTopic:486 - Reducing outliers via embeddings strategy...
2024-10-11 10:15:17,720 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
╭──────────────────────────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────────────────────────╮
│ /home/dev-dsia/dev/BERTrend/bertrend_apps/newsletters/main.py:131 in newsletter_from_feed │
│ │
│ 128 │ │ │
│ 129 │ │ else: │
│ 130 │ │ │ # learn and predict │
│ ❱ 131 │ │ │ topics, topic_model = _train_topic_model(config, dataset) │
│ 132 │ │ │ # train topic model with the dataset │
│ 133 │ │ │ if model_path: │
│ 134 │ │ │ │ logger.info(f"Saving topic model to: {model_path}") │
│ │
│ ╭─────────────────────────────────────────────────────────────── locals ───────────────────────────────────────────────────────────────╮ │
│ │ config = <configparser.ConfigParser object at 0x7f28b482f210> │ │
│ │ data_feed_cfg = <configparser.ConfigParser object at 0x7f28b482e910> │ │
│ │ data_feed_cfg_path = PosixPath('/home/dev-dsia/dev/BERTrend/bertrend_apps/config/feeds/llm_en_feed.cfg') │ │
│ │ dataset = │ index ... timestamp │ │
│ │ 0 0 ... 2024-10-09 22:40:00 │ │
│ │ 1 1 ... 2024-10-09 22:40:00 │ │
│ │ 2 2 ... 2024-10-09 22:40:00 │ │
│ │ 3 3 ... 2024-10-09 22:40:00 │ │
│ │ 4 4 ... 2024-10-09 22:40:00 │ │
│ │ ... ... ... ... │ │
│ │ 4751 4751 ... 2024-09-26 07:00:00 │ │
│ │ 4752 4752 ... 2024-09-26 07:00:00 │ │
│ │ 4753 4753 ... 2024-09-26 07:00:00 │ │
│ │ 4754 4754 ... 2024-09-26 07:00:00 │ │
│ │ 4755 4755 ... 2024-09-26 07:00:00 │ │
│ │ │ │
│ │ [4756 rows x 7 columns] │ │
│ │ learning_strategy = <Section: learning_strategy> │ │
│ │ learning_type = 'learn_from_last' │ │
│ │ model_path = PosixPath('/scratch/nlp/output/bertrend/models/llm_from_last_en') │ │
│ │ newsletter_cfg_path = PosixPath('/home/dev-dsia/dev/BERTrend/bertrend_apps/config/newsletters/llm_en_newsletter_from_last.cfg') │ │
│ │ newsletter_params = <Section: newsletter> │ │
│ │ original_dataset = │ index ... timestamp │ │
│ │ 0 0 ... 2024-10-06 09:05:37 │ │
│ │ 1 1 ... 2024-09-30 07:00:00 │ │
│ │ 2 2 ... 2024-09-30 14:55:27 │ │
│ │ 3 3 ... 2024-10-03 17:09:00 │ │
│ │ 4 4 ... 2024-10-06 07:00:00 │ │
│ │ .. ... ... ... │ │
│ │ 242 242 ... 2024-10-06 17:14:04 │ │
│ │ 243 243 ... 2024-10-08 15:00:00 │ │
│ │ 244 244 ... 2024-09-28 07:00:00 │ │
│ │ 245 245 ... 2024-09-28 07:00:00 │ │
│ │ 246 246 ... 2024-09-27 07:00:00 │ │
│ │ │ │
│ │ [247 rows x 7 columns] │ │
│ │ split_data_by_paragraphs = True │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │
│ │
│ /home/dev-dsia/dev/BERTrend/bertrend_apps/newsletters/main.py:221 in _train_topic_model │
│ │
│ 218 │ │ if topic_params.get("nr_topics") == 0: │
│ 219 │ │ │ topic_params["nr_topics"] = None │
│ 220 │ │ │
│ ❱ 221 │ │ topic_model, topics, _, _, _, _ = train_BERTopic( │
│ 222 │ │ │ **topic_params, │
│ 223 │ │ │ full_dataset=dataset, │
│ 224 │ │ │ embedding_model_name=embedding_model_name, │
│ │
│ ╭──────────────────────────────────────────────────────────────────── locals ────────────────────────────────────────────────────────────────────╮ │
│ │ config = <configparser.ConfigParser object at 0x7f28b482f210> │ │
│ │ ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) │ │
│ │ dataset = │ index ... timestamp │ │
│ │ 0 0 ... 2024-10-09 22:40:00 │ │
│ │ 1 1 ... 2024-10-09 22:40:00 │ │
│ │ 2 2 ... 2024-10-09 22:40:00 │ │
│ │ 3 3 ... 2024-10-09 22:40:00 │ │
│ │ 4 4 ... 2024-10-09 22:40:00 │ │
│ │ ... ... ... ... │ │
│ │ 4751 4751 ... 2024-09-26 07:00:00 │ │
│ │ 4752 4752 ... 2024-09-26 07:00:00 │ │
│ │ 4753 4753 ... 2024-09-26 07:00:00 │ │
│ │ 4754 4754 ... 2024-09-26 07:00:00 │ │
│ │ 4755 4755 ... 2024-09-26 07:00:00 │ │
│ │ │ │
│ │ [4756 rows x 7 columns] │ │
│ │ embedding_model_name = 'sentence-transformers/all-mpnet-base-v2' │ │
│ │ hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=10, prediction_data=True) │ │
│ │ topic_params = {'nr_topics': None, 'top_n_words': 5} │ │
│ │ umap_model = UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: │ │
│ │ {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}) │ │
│ │ vectorizer_model = CountVectorizer(min_df=2, │ │
│ │ │ │ │ │ stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', │ │
│ │ │ │ │ │ │ │ │ 'ourselves', 'you', "you're", "you've", "you'll", │ │
│ │ │ │ │ │ │ │ │ "you'd", 'your', 'yours', 'yourself', 'yourselves', │ │
│ │ │ │ │ │ │ │ │ 'he', 'him', 'his', 'himself', 'she', "she's", │ │
│ │ │ │ │ │ │ │ │ 'her', 'hers', 'herself', 'it', "it's", 'its', │ │
│ │ │ │ │ │ │ │ │ 'itself', ...]) │ │
│ ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │
│ │
│ /home/dev-dsia/dev/BERTrend/bertrend/train.py:494 in train_BERTopic │
│ │
│ 491 │ │ strategy="embeddings", │
│ 492 │ ) │
│ 493 │ │
│ ❱ 494 │ topic_model.update_topics( │
│ 495 │ │ filtered_dataset[column], │
│ 496 │ │ topics=new_topics, │
│ 497 │ │ vectorizer_model=vectorizer_model, │
│
The text was updated successfully, but these errors were encountered: