forked from evallen/querycap
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
108 lines (86 loc) · 3.96 KB
/
model.py
File metadata and controls
108 lines (86 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import pathlib
from flask import Flask, request
from werkzeug.utils import secure_filename
# from transformers import BlipProcessor, BlipForQuestionAnswering
# from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image
import whisper
from datetime import datetime
import requests
from transformers import BlipProcessor, Blip2ForConditionalGeneration
# from transformers import BlipProcessor, BlipForQuestionAnswering
# Vision-language model used to answer questions about an image.
# Earlier experiments used "Salesforce/blip-vqa-capfilt-large"
# (BlipForQuestionAnswering) and "Salesforce/instructblip-vicuna-7b"
# (InstructBlipForConditionalGeneration) instead.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip2-flan-t5-xl")
blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Directories where uploaded images and audio clips are stored.
IMAGE_FOLDER = "./images"
SOUND_FOLDER = "./sounds"

app = Flask(__name__)
app.config.update(
    IMAGE_FOLDER=IMAGE_FOLDER,
    SOUND_FOLDER=SOUND_FOLDER,
)

# Create the upload directories up front so saving an upload cannot fail on
# a missing folder.
pathlib.Path(IMAGE_FOLDER).mkdir(parents=True, exist_ok=True)
pathlib.Path(SOUND_FOLDER).mkdir(parents=True, exist_ok=True)

# Speech-to-text model used to transcribe the spoken question.
whisper_model = whisper.load_model('base')
@app.after_request
def after(response):
    """Print every outgoing response to stdout and return it unchanged.

    The status line, the headers and the body are printed in that order,
    each on its own ``print`` record, as a debugging aid.
    """
    print(response.status, response.headers, sep="\n")
    print(response.get_data())
    return response
def process_image_sound(image_file, sound_file):
    """Answer a spoken question about an image.

    The audio is transcribed with the module-level Whisper model; the
    transcription is then used as the question in a
    ``"Question: ... Answer: "`` prompt for the module-level BLIP-2 model,
    together with the image.

    Args:
        image_file: Image to open with ``PIL.Image.open`` (a path or a
            file object).
        sound_file: Audio input handed to ``whisper_model.transcribe``.

    Returns:
        The decoded model answer as a string with surrounding whitespace
        stripped.
    """
    print('processing now')

    question = whisper_model.transcribe(sound_file)['text']
    print(f"QUESTION: {question}")

    # Open the image in a context manager so the underlying file handle is
    # always released; ``convert`` returns an independent in-memory copy that
    # stays valid after the file is closed.
    with Image.open(image_file) as opened_image:
        raw_image = opened_image.convert('RGB')

    inputs = blip_processor(raw_image, f"Question: {question} Answer: ", return_tensors='pt')
    out = blip_model.generate(**inputs, max_new_tokens=256, min_length=10)

    # ``generate`` returns a batch; only one prompt was submitted, so take the
    # first decoded sequence.
    answer = blip_processor.batch_decode(out, skip_special_tokens=True)[0].strip()
    print(f"ANSWER: {answer}")

    return answer
def check_for_file(name):
    """Verify that the current request carries an uploaded file called *name*.

    Args:
        name: Key expected in ``request.files``.

    Returns:
        ``None`` when the file is present; otherwise a ``(body, 400)`` tuple
        whose body is a dict with ``"success": False`` and an explanatory
        ``"message"``.
    """
    if name in request.files:
        return None

    print(f"didn't find file {name}")
    error_body = {
        "success": False,
        "message": f"'{name}' file not found in request.",
    }
    return error_body, 400
@app.route("/send_image", methods=['POST'])
def send_image():
    """Handle an upload of an image plus a spoken question about it.

    Expects a POST request with two uploaded files, ``image`` and ``sound``.
    Both are saved under their configured folders with a timestamp prefix,
    then passed to ``process_image_sound``.

    Returns:
        ``({"success": True, "answer": <str>}, 200)`` on success, or the
        error response produced by ``check_for_file`` when either upload is
        missing.
    """
    # check_for_file returns an error response when the upload is missing and
    # None otherwise; that result must be returned to the client, not dropped.
    for field in ('image', 'sound'):
        error = check_for_file(field)
        if error is not None:
            return error

    image = request.files['image']
    sound = request.files['sound']

    # A single timestamp prefixes both files so the pair from one request
    # shares the same prefix.
    now = datetime.now()
    now_str = now.strftime("%Y-%m-%d_%H-%M-%S")

    # secure_filename sanitises the client-supplied name before it is used to
    # build a path on disk.
    image_filename = now_str + secure_filename(image.filename)
    image_filepath = os.path.join(app.config['IMAGE_FOLDER'], image_filename)
    image.save(image_filepath)

    sound_filename = now_str + secure_filename(sound.filename)
    sound_filepath = os.path.join(app.config['SOUND_FOLDER'], sound_filename)
    sound.save(sound_filepath)

    answer = process_image_sound(image_filepath, sound_filepath)

    return {
        "success": True,
        "answer": answer,
    }, 200