diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
index 882450e..0fa5858 100644
--- a/web_demo_streamlit-minicpmv2_6.py
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -173,91 +173,97 @@ def uniform_sample(frame_indices, num_samples):
     return frames
 
 
+
 # User input box
 user_text = st.chat_input("Enter your question")
-if user_text:
-    # Display user input and save it to session history
-    with st.chat_message(U_NAME, avatar="user"):
-        st.session_state.chat_history.append({
-            "role": "user",
-            "content": user_text,
-            "image": None,
-            "video": None
-        })
-        st.markdown(f"{U_NAME}: {user_text}")
-
-    # Generate responses using the model
-    model = st.session_state.model
-    tokenizer = st.session_state.tokenizer
-    content_list = []  # Store the content (text or image) that will be passed into the model
-    imageFile = None
-
-    with st.chat_message(A_NAME, avatar="assistant"):
-        # Handle different inputs depending on the mode selected by the user
-        if selected_mode == "Single Image":
-            # Single image mode: pass in the last uploaded image
-            print("Single Images mode in use")
-            if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
-                uploaded_image = st.session_state.uploaded_image_list[-1]
-                if uploaded_image:
-                    imageFile = Image.open(uploaded_image).convert('RGB')
-                    content_list.append(imageFile)
-            else:
-                print("Single Images mode: No image found")
-
-        elif selected_mode == "Multiple Images":
-            # Multi-image mode: pass in all the images uploaded last time
-            print("Multiple Images mode in use")
-            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
-                for uploaded_image in st.session_state.uploaded_image_list:
-                    imageFile = Image.open(uploaded_image).convert('RGB')
-                    content_list.append(imageFile)
-            else:
-                print("Multiple Images mode: No image found")
-
-        elif selected_mode == "Video":
-            # Video mode: pass in slice frames of uploaded video
-            print("Video mode in use")
-            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
-                uploaded_video_path = st.session_state.uploaded_video_list[-1]
-                if uploaded_video_path:
-                    frames = encode_video(uploaded_video_path)
+if user_text is not None:
+    if not user_text.strip():  # compare by value: `is ""` tests object identity, not emptiness
+        st.warning('Input message could not be empty!', icon="⚠️")
+    else:
+        # Display user input and save it to session history
+        with st.chat_message(U_NAME, avatar="user"):
+            st.session_state.chat_history.append({
+                "role": "user",
+                "content": user_text,
+                "image": None,
+                "video": None
+            })
+            st.markdown(f"{U_NAME}: {user_text}")
+
+        # Generate responses using the model
+        model = st.session_state.model
+        tokenizer = st.session_state.tokenizer
+        content_list = []  # Store the content (text or image) that will be passed into the model
+        imageFile = None
+
+        with st.chat_message(A_NAME, avatar="assistant"):
+            # Handle different inputs depending on the mode selected by the user
+            if selected_mode == "Single Image":
+                # Single image mode: pass in the last uploaded image
+                print("Single Images mode in use")
+                if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
+                    uploaded_image = st.session_state.uploaded_image_list[-1]
+                    if uploaded_image:
+                        imageFile = Image.open(uploaded_image).convert('RGB')
+                        content_list.append(imageFile)
+                else:
+                    print("Single Images mode: No image found")
+
+            elif selected_mode == "Multiple Images":
+                # Multi-image mode: pass in all the images uploaded last time
+                print("Multiple Images mode in use")
+                if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
+                    for uploaded_image in st.session_state.uploaded_image_list:
+                        imageFile = Image.open(uploaded_image).convert('RGB')
+                        content_list.append(imageFile)
+                else:
+                    print("Multiple Images mode: No image found")
+
+            elif selected_mode == "Video":
+                # Video mode: pass in slice frames of uploaded video
+                print("Video mode in use")
+                if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
+                    uploaded_video_path = st.session_state.uploaded_video_list[-1]
+                    if uploaded_video_path:
+                        frames = encode_video(uploaded_video_path)
+                else:
+                    print("Video Mode: No video found")
+
+            # Defining model parameters
+            params = {
+                'sampling': True,
+                'top_p': top_p,
+                'top_k': top_k,
+                'temperature': temperature,
+                'repetition_penalty': repetition_penalty,
+                "max_new_tokens": max_length,
+                "stream": True
+            }
+
+            # Set different input parameters depending on whether to upload a video
+            if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
+                msgs = [{"role": "user", "content": frames + [user_text]}]
+                # Set decode params for video
+                params["max_inp_length"] = 4352  # Set the maximum input length of the video mode
+                params["use_image_id"] = False  # Do not use image_id
+                params["max_slice_nums"] = 1  # # use 1 if cuda OOM and video resolution > 448*448
             else:
-                print("Video Mode: No video found")
-
-        # Defining model parameters
-        params = {
-            'sampling': True,
-            'top_p': top_p,
-            'top_k': top_k,
-            'temperature': temperature,
-            'repetition_penalty': repetition_penalty,
-            "max_new_tokens": max_length,
-            "stream": True
-        }
-
-        # Set different input parameters depending on whether to upload a video
-        if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
-            msgs = [{"role": "user", "content": frames + [user_text]}]
-            # Set decode params for video
-            params["max_inp_length"] = 4352  # Set the maximum input length of the video mode
-            params["use_image_id"] = False  # Do not use image_id
-            params["max_slice_nums"] = 1  # # use 1 if cuda OOM and video resolution > 448*448
-        else:
-            content_list.append(user_text)
-            msgs = [{"role": "user", "content": content_list}]
-
-        print("content_list:", content_list)  # debug
-        print("params:", params)  # debug
-
-        # Generate and display the model's responses
-        response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
-        st.session_state.response = st.write_stream(response)
-        st.session_state.chat_history.append({
-            "role": "model",
-            "content": st.session_state.response,
-            "image": None,
-            "video": None
-        })
-
-    st.divider()  # Add separators to the interface
+                content_list.append(user_text)
+                msgs = [{"role": "user", "content": content_list}]
+
+            print("content_list:", content_list)  # debug
+            print("params:", params)  # debug
+
+            # Generate and display the model's responses
+            with st.spinner('AI is thinking...'):
+                response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
+            st.session_state.response = st.write_stream(response)
+            st.session_state.chat_history.append({
+                "role": "model",
+                "content": st.session_state.response,
+                "image": None,
+                "video": None
+            })
+
+        st.divider()  # Add separators to the interface
+