-
Notifications
You must be signed in to change notification settings - Fork 653
/
video_demo.py
109 lines (94 loc) · 3.47 KB
/
video_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
# coding: utf-8
import torch
import torchvision.transforms as transforms
import mobilenet_v1
import numpy as np
import cv2
import dlib
from utils.ddfa import ToTensorGjz, NormalizeGjz
import scipy.io as sio
from utils.inference import (
parse_roi_box_from_landmark,
crop_img,
predict_68pts,
predict_dense,
)
from utils.cv_plot import plot_kpt
from utils.render import get_depths_image, cget_depths_image, cpncc
from utils.paf import gen_img_paf
import argparse
import torch.backends.cudnn as cudnn
# Side length (in pixels) of the square network input: every face crop is
# resized to STD_SIZE x STD_SIZE before being passed to the model.
STD_SIZE = 120
def _parse_video_source(video):
    """Turn the ``--video`` argument into a source for ``cv2.VideoCapture``.

    Integer-like values (``'0'``, ``'1'``, ``0``) are returned as ``int`` so
    they are opened as camera indices; anything else is returned unchanged
    and opened as a file path.
    """
    try:
        return int(video)
    except ValueError:
        return video


def main(args):
    """Run the demo on a video source and display the shaded frames.

    The first faces found by dlib seed the tracking; afterwards each frame's
    crop box is derived from the 68 landmarks predicted on the previous frame,
    so the face detector only runs while no face is being tracked.

    Args:
        args: parsed command-line namespace. ``args.video`` is a video file
            path or an OpenCV camera index; ``args.mode`` selects CUDA when it
            equals ``'gpu'`` and CPU for any other value.

    Raises:
        IOError: if the video source cannot be opened.

    Press ``q`` or ``Esc`` in the display window to stop early.
    """
    # 0. open video
    vc = cv2.VideoCapture(_parse_video_source(args.video))
    if not vc.isOpened():
        vc.release()
        raise IOError('cannot open video source: {!r}'.format(args.video))

    use_gpu = args.mode == 'gpu'

    try:
        # 1. load pre-trained model
        checkpoint_fp = 'models/phase1_wpdc_vdc.pth.tar'
        arch = 'mobilenet_1'

        tri = sio.loadmat('visualize/tri.mat')['tri']
        transform = transforms.Compose(
            [ToTensorGjz(), NormalizeGjz(mean=127.5, std=128)]
        )

        # Always deserialize onto the CPU; the model is moved to CUDA below
        # only when GPU mode is requested.
        checkpoint = torch.load(
            checkpoint_fp, map_location=lambda storage, loc: storage
        )['state_dict']
        # 62 = 12(pose) + 40(shape) + 10(expression)
        model = getattr(mobilenet_v1, arch)(num_classes=62)

        model_dict = model.state_dict()
        # because the model is trained by multiple gpus, prefix module should
        # be removed
        for key, value in checkpoint.items():
            model_dict[key.replace('module.', '')] = value
        model.load_state_dict(model_dict)

        if use_gpu:
            cudnn.benchmark = True
            model = model.cuda()
        model.eval()

        # 2. load dlib model for face detection and landmark used for face
        # cropping
        dlib_landmark_model = 'models/shape_predictor_68_face_landmarks.dat'
        face_regressor = dlib.shape_predictor(dlib_landmark_model)
        face_detector = dlib.get_frontal_face_detector()

        # 3. forward
        success, frame = vc.read()
        last_frame_pts = []

        while success:
            # Detect faces only while nothing is tracked yet; once landmarks
            # exist they are refreshed in place from the model output below.
            if not last_frame_pts:
                for rect in face_detector(frame, 1):
                    parts = face_regressor(frame, rect).parts()
                    # One column per landmark: row 0 is x, row 1 is y.
                    pts = np.array([[pt.x, pt.y] for pt in parts]).T
                    last_frame_pts.append(pts)

            vertices_lst = []
            for lmk in last_frame_pts:
                roi_box = parse_roi_box_from_landmark(lmk)
                img = crop_img(frame, roi_box)
                img = cv2.resize(
                    img,
                    dsize=(STD_SIZE, STD_SIZE),
                    interpolation=cv2.INTER_LINEAR,
                )

                batch = transform(img).unsqueeze(0)
                with torch.no_grad():
                    if use_gpu:
                        batch = batch.cuda()
                    param = model(batch)
                    param = (
                        param.squeeze().cpu().numpy().flatten().astype(np.float32)
                    )

                pts68 = predict_68pts(param, roi_box)
                vertex = predict_dense(param, roi_box)
                # Carry this frame's prediction over as the landmarks used to
                # crop the next frame.
                lmk[:] = pts68[:2]
                vertices_lst.append(vertex)

            # NOTE(review): `tri - 1` presumably converts 1-based triangle
            # indices from the .mat file to 0-based - confirm against cpncc.
            pncc = cpncc(frame, vertices_lst, tri - 1) / 255.0
            frame = frame / 255.0 * (1.0 - pncc)
            cv2.imshow('3ddfa', frame)

            # waitKey returns -1 when no key is pressed; mask to the low byte
            # so the comparison is portable across platforms.
            if cv2.waitKey(1) & 0xFF in (27, ord('q')):
                break

            success, frame = vc.read()
    finally:
        # Free the camera/file handle and close the window even on errors or
        # Ctrl+C.
        vc.release()
        cv2.destroyAllWindows()
if __name__ == '__main__':
    # Command-line entry point: collect the video source and the device mode,
    # then hand the parsed namespace to main().
    cli = argparse.ArgumentParser(description='3DDFA inference pipeline')
    cli.add_argument(
        '-v', '--video',
        type=str,
        default='0',
        help='video file path or opencv cam index',
    )
    cli.add_argument(
        '-m', '--mode',
        type=str,
        default='cpu',
        help='gpu or cpu mode',
    )
    main(cli.parse_args())