"""
Flask Server for TARS-AI Application.
This script provides a Flask-based API server that handles image captioning
(using BLIP) and audio transcription (using faster-whisper).
"""
from flask import Flask, request, jsonify
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import traceback
from faster_whisper import WhisperModel
from flask_cors import CORS
from io import BytesIO
from datetime import datetime
# Initialize Flask app and enable CORS
app = Flask(__name__)
CORS(app)
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize BLIP model for image captioning
def initialize_blip_model():
    """Load BLIP model and processor for image captioning."""
    try:
        processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base", cache_dir="./src/vision"
        )
        model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base", cache_dir="./src/vision"
        )
        model.to(device)
        return processor, model
    except Exception as e:
        print("Error initializing BLIP model:", traceback.format_exc())
        raise e
# Initialize Whisper model for audio transcription
"""
# Whisper Model Options
# OpenAI Whisper provides a range of models with varying size, speed, and accuracy:
1. Tiny
- Size: ~39 MB
- Use Case: Basic transcription tasks prioritizing speed over accuracy.
2. Base
- Size: ~74 MB
- Use Case: Quick transcription in moderately constrained environments.
3. Small
- Size: ~244 MB
- Use Case: Real-time transcription where quality matters more.
4. Medium
- Size: ~769 MB
- Use Case: Transcription tasks with high accuracy requirements.
5. Large
- Size: ~1.55 GB
- Use Case: Critical transcription tasks where accuracy is paramount.
"""
def initialize_whisper_model(
    model_size="large-v3",
    device="cuda" if torch.cuda.is_available() else "cpu",
    compute_type="int8_float16"  # Use "float16" or "int8" for optimization
):
    """Load Whisper model for audio transcription using faster-whisper."""
    try:
        return WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type
        )
    except Exception as e:
        print("Error initializing Whisper model:", traceback.format_exc())
        raise e
# Initialize models globally within the main block
blip_processor = None
blip_model = None
whisper_model = None
# Routes
@app.route('/caption', methods=['POST'])
def caption_image():
    """Endpoint to generate a caption for an uploaded image."""
    try:
        if 'image' not in request.files:
            return jsonify({"error": "No image file provided"}), 400

        image_file = request.files['image']
        image_bytes = BytesIO(image_file.read())

        # Debug: Save the received file for inspection
        #with open("received_image_debug.jpg", "wb") as f:
        #    f.write(image_bytes.getvalue())
        #print("DEBUG: Saved received image as received_image_debug.jpg")

        try:
            image = Image.open(image_bytes)
            #print(f"DEBUG: Image loaded successfully. Format: {image.format}")
            image = image.convert("RGB")  # Convert to RGB to ensure BLIP compatibility
        except Exception as e:
            #print(f"DEBUG: Failed to open/convert image: {traceback.format_exc()}")
            return jsonify({"error": "Invalid image file"}), 400

        #print(f"DEBUG: Image format: {image.format}, Size: {image.size}, Mode: {image.mode}")

        # Process the image with BLIP
        inputs = blip_processor(image, return_tensors="pt").to(device)
        outputs = blip_model.generate(**inputs, max_new_tokens=100, num_beams=3)
        caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
        return jsonify({"caption": caption})
    except Exception as e:
        print("Error occurred during caption generation:", traceback.format_exc())
        return jsonify({"error": str(e)}), 500
@app.route('/save_audio', methods=['POST'])
def save_audio():
    """Endpoint to transcribe uploaded audio using Whisper."""
    #print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Accessed")
    try:
        if 'audio' not in request.files:
            return jsonify({"error": "No audio file provided"}), 400

        audio_blob = request.files['audio']
        audio_bytes = BytesIO(audio_blob.read())

        segments, _ = whisper_model.transcribe(audio_bytes, beam_size=5)
        transcription = [
            {"text": segment.text, "start": segment.start, "end": segment.end}
            for segment in segments
        ]
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Transcription: {transcription}")
        return jsonify({"transcription": transcription})
    except Exception as e:
        print("Error occurred during audio transcription:", traceback.format_exc())
        return jsonify({"error": str(e)}), 500
if __name__ == '__main__':
    try:
        blip_processor, blip_model = initialize_blip_model()
        whisper_model = initialize_whisper_model(
            model_size="tiny",
            device="cuda" if torch.cuda.is_available() else "cpu",
            compute_type="int8_float16" if torch.cuda.is_available() else "int8"
        )
        app.run(host='0.0.0.0', port=5678)
    except Exception as e:
        print("Critical error during initialization:", traceback.format_exc())