diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..83712563708fb2f6449e8e955fddaa3e9944ea71 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/docs/showcase.gif filter=lfs diff=lfs merge=lfs -text +assets/docs/showcase2.gif filter=lfs diff=lfs merge=lfs -text +assets/examples/driving/d0.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/driving/d3.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/driving/d6.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/driving/d9.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..07050fdfd51205238c1139a14725cdc412a71290 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +**/__pycache__/ +*.py[cod] +**/*.py[cod] +*$py.class + +# Model weights +**/*.pth +**/*.onnx + +# Ipython notebook +*.ipynb + +# Temporary files or benchmark resources +animations/* +tmp/* diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..1bca84ccf9fed7936fc93d2704ff4eab6c734728 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,19 @@ +{ + "[python]": { + "editor.tabSize": 4 + }, + "files.eol": "\n", + "files.insertFinalNewline": true, + "files.trimFinalNewlines": true, + "files.trimTrailingWhitespace": true, + "files.exclude": { + "**/.git": true, + "**/.svn": true, + "**/.hg": true, + "**/CVS": true, + "**/.DS_Store": true, + "**/Thumbs.db": true, + "**/*.crswap": true, + "**/__pycache__": true + } +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e8f5026e9273b98745188ec4bbc8ac05b2b22ef --- /dev/null +++ b/LICENSE @@ 
-0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Kuaishou Visual Generation and Interaction Center + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..a82443bdc9fdf15e1f43a571f49076a8f61e37c0 --- /dev/null +++ b/app.py @@ -0,0 +1,154 @@ +# coding: utf-8 + +""" +The entrance of the gradio +""" + +import tyro +import gradio as gr +import os.path as osp +from src.utils.helper import load_description +from src.gradio_pipeline import GradioPipeline +from src.config.crop_config import CropConfig +from src.config.argument_config import ArgumentConfig +from src.config.inference_config import InferenceConfig + + +def partial_fields(target_class, kwargs): + return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)}) + + +# set tyro theme +tyro.extras.set_accent_color("bright_cyan") +args = tyro.cli(ArgumentConfig) + +# specify configs for inference +inference_cfg = partial_fields(InferenceConfig, args.__dict__) # use attribute of args to initial InferenceConfig +crop_cfg = partial_fields(CropConfig, args.__dict__) # use attribute of args to initial CropConfig +gradio_pipeline = GradioPipeline( + inference_cfg=inference_cfg, + crop_cfg=crop_cfg, + args=args +) +# assets +title_md = "assets/gradio_title.md" +example_portrait_dir = "assets/examples/source" +example_video_dir = "assets/examples/driving" +data_examples = [ + [osp.join(example_portrait_dir, "s9.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True], + [osp.join(example_portrait_dir, "s6.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True], + [osp.join(example_portrait_dir, "s10.jpg"), osp.join(example_video_dir, "d5.mp4"), True, True, True, True], + [osp.join(example_portrait_dir, "s5.jpg"), osp.join(example_video_dir, "d6.mp4"), True, True, True, True], + [osp.join(example_portrait_dir, "s7.jpg"), osp.join(example_video_dir, "d7.mp4"), True, True, True, True], +] +#################### interface logic #################### + +# Define components first +eye_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, 
step=0.01, label="target eyes-open ratio") +lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target lip-open ratio") +retargeting_input_image = gr.Image(type="numpy") +output_image = gr.Image(type="numpy") +output_image_paste_back = gr.Image(type="numpy") +output_video = gr.Video() +output_video_concat = gr.Video() + +with gr.Blocks(theme=gr.themes.Soft()) as demo: + gr.HTML(load_description(title_md)) + gr.Markdown(load_description("assets/gradio_description_upload.md")) + with gr.Row(): + with gr.Accordion(open=True, label="Source Portrait"): + image_input = gr.Image(type="filepath") + with gr.Accordion(open=True, label="Driving Video"): + video_input = gr.Video() + gr.Markdown(load_description("assets/gradio_description_animation.md")) + with gr.Row(): + with gr.Accordion(open=True, label="Animation Options"): + with gr.Row(): + flag_relative_input = gr.Checkbox(value=True, label="relative motion") + flag_do_crop_input = gr.Checkbox(value=True, label="do crop") + flag_remap_input = gr.Checkbox(value=True, label="paste-back") + with gr.Row(): + with gr.Column(): + process_button_animation = gr.Button("๐Ÿš€ Animate", variant="primary") + with gr.Column(): + process_button_reset = gr.ClearButton([image_input, video_input, output_video, output_video_concat], value="๐Ÿงน Clear") + with gr.Row(): + with gr.Column(): + with gr.Accordion(open=True, label="The animated video in the original image space"): + output_video.render() + with gr.Column(): + with gr.Accordion(open=True, label="The animated video"): + output_video_concat.render() + with gr.Row(): + # Examples + gr.Markdown("## You could choose the examples below โฌ‡๏ธ") + with gr.Row(): + gr.Examples( + examples=data_examples, + inputs=[ + image_input, + video_input, + flag_relative_input, + flag_do_crop_input, + flag_remap_input + ], + examples_per_page=5 + ) + gr.Markdown(load_description("assets/gradio_description_retargeting.md")) + with gr.Row(): + eye_retargeting_slider.render() 
+ lip_retargeting_slider.render() + with gr.Row(): + process_button_retargeting = gr.Button("๐Ÿš— Retargeting", variant="primary") + process_button_reset_retargeting = gr.ClearButton( + [ + eye_retargeting_slider, + lip_retargeting_slider, + retargeting_input_image, + output_image, + output_image_paste_back + ], + value="๐Ÿงน Clear" + ) + with gr.Row(): + with gr.Column(): + with gr.Accordion(open=True, label="Retargeting Input"): + retargeting_input_image.render() + with gr.Column(): + with gr.Accordion(open=True, label="Retargeting Result"): + output_image.render() + with gr.Column(): + with gr.Accordion(open=True, label="Paste-back Result"): + output_image_paste_back.render() + # binding functions for buttons + process_button_retargeting.click( + fn=gradio_pipeline.execute_image, + inputs=[eye_retargeting_slider, lip_retargeting_slider], + outputs=[output_image, output_image_paste_back], + show_progress=True + ) + process_button_animation.click( + fn=gradio_pipeline.execute_video, + inputs=[ + image_input, + video_input, + flag_relative_input, + flag_do_crop_input, + flag_remap_input + ], + outputs=[output_video, output_video_concat], + show_progress=True + ) + image_input.change( + fn=gradio_pipeline.prepare_retargeting, + inputs=image_input, + outputs=[eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image] + ) + +########################################################## + +demo.launch( + server_name=args.server_name, + server_port=args.server_port, + share=args.share, +) diff --git a/assets/docs/inference.gif b/assets/docs/inference.gif new file mode 100644 index 0000000000000000000000000000000000000000..7e18022e5245dcb6449df6d190b538d5ca024e06 Binary files /dev/null and b/assets/docs/inference.gif differ diff --git a/assets/docs/showcase.gif b/assets/docs/showcase.gif new file mode 100644 index 0000000000000000000000000000000000000000..fae84c2d3550a37446e482286b70902b21e2e232 --- /dev/null +++ b/assets/docs/showcase.gif @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7bca5f38bfd555bf7c013312d87883afdf39d97fba719ac171c60f897af49e21 +size 6623248 diff --git a/assets/docs/showcase2.gif b/assets/docs/showcase2.gif new file mode 100644 index 0000000000000000000000000000000000000000..29175c0eeb85b9db0ffd61e3e9281dffe3536352 --- /dev/null +++ b/assets/docs/showcase2.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1fffb139681775780b2956e7d0289f55d199c1a3e14ab263887864d4b0d586 +size 2881351 diff --git a/assets/examples/driving/d0.mp4 b/assets/examples/driving/d0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..92391dd3ff235fc82f29b7cc77fe4a7ce183d934 --- /dev/null +++ b/assets/examples/driving/d0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f6f9962e1fdf6e6722172e7a18155204858d5d5ce3b1e0646c150360c33bed +size 2958395 diff --git a/assets/examples/driving/d1.mp4 b/assets/examples/driving/d1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e2825c1d0663cb82aadc035f2cfbc9c75288a766 Binary files /dev/null and b/assets/examples/driving/d1.mp4 differ diff --git a/assets/examples/driving/d2.mp4 b/assets/examples/driving/d2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a14da2d8a07d6b88f4581635052982c15b15d988 Binary files /dev/null and b/assets/examples/driving/d2.mp4 differ diff --git a/assets/examples/driving/d3.mp4 b/assets/examples/driving/d3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8b70b6aa3c0e566a4fa3e5959f2d3b916e99b708 --- /dev/null +++ b/assets/examples/driving/d3.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5c86e49b1b43dcb1449b499eb5a7f0cbae2f78aec08b5598193be1e4257099 +size 1430968 diff --git a/assets/examples/driving/d5.mp4 b/assets/examples/driving/d5.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..332bc88cc665df3135662086f1b916b3789b380a Binary files 
/dev/null and b/assets/examples/driving/d5.mp4 differ diff --git a/assets/examples/driving/d6.mp4 b/assets/examples/driving/d6.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..44f351385cef843b21b03fab8c3b10e0c005ec5e --- /dev/null +++ b/assets/examples/driving/d6.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e3ea79bbf28cbdc4fbb67ec655d9a0fe876e880ec45af55ae481348d0c0fff +size 1967790 diff --git a/assets/examples/driving/d7.mp4 b/assets/examples/driving/d7.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..81b5ae1a6f301d613100547cce5ba06eeb033e12 Binary files /dev/null and b/assets/examples/driving/d7.mp4 differ diff --git a/assets/examples/driving/d8.mp4 b/assets/examples/driving/d8.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7fabddedb916f57eb35d114b0a9c88c7a212d68f Binary files /dev/null and b/assets/examples/driving/d8.mp4 differ diff --git a/assets/examples/driving/d9.mp4 b/assets/examples/driving/d9.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7803b3bf5c460a79d94e5cfbedb0de1f52d449d2 --- /dev/null +++ b/assets/examples/driving/d9.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a414aa1d547be35306d692065a2157434bf40a6025ba8e30ce12e5bb322cc33 +size 2257929 diff --git a/assets/examples/source/s0.jpg b/assets/examples/source/s0.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ef44c593be38cea30422fff9ed986a8a77889348 Binary files /dev/null and b/assets/examples/source/s0.jpg differ diff --git a/assets/examples/source/s1.jpg b/assets/examples/source/s1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ebacda3519a1452aee239f7e104d2c6ff40beb25 Binary files /dev/null and b/assets/examples/source/s1.jpg differ diff --git a/assets/examples/source/s10.jpg b/assets/examples/source/s10.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..ee9616b592f070fbe90a8717da01477e8d4ee01f Binary files /dev/null and b/assets/examples/source/s10.jpg differ diff --git a/assets/examples/source/s2.jpg b/assets/examples/source/s2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e851bd20b65c552266a87bb87a9b509e3ea56f7d Binary files /dev/null and b/assets/examples/source/s2.jpg differ diff --git a/assets/examples/source/s3.jpg b/assets/examples/source/s3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f3ba2a358e5b88450e7466761dff3e983e18e16 Binary files /dev/null and b/assets/examples/source/s3.jpg differ diff --git a/assets/examples/source/s4.jpg b/assets/examples/source/s4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..17f611bf942ad168d4e4d03b7e5c42d6650c4be1 Binary files /dev/null and b/assets/examples/source/s4.jpg differ diff --git a/assets/examples/source/s5.jpg b/assets/examples/source/s5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9abad7ef061b93579a373cf141d38710d9b1e32d Binary files /dev/null and b/assets/examples/source/s5.jpg differ diff --git a/assets/examples/source/s6.jpg b/assets/examples/source/s6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..91c13d5f2b48d143ca596566ad10f0a0e5693da4 Binary files /dev/null and b/assets/examples/source/s6.jpg differ diff --git a/assets/examples/source/s7.jpg b/assets/examples/source/s7.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf96f2d5651f7ae0faf08193ecd3df282c5c3b53 Binary files /dev/null and b/assets/examples/source/s7.jpg differ diff --git a/assets/examples/source/s8.jpg b/assets/examples/source/s8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b415ed1d4a4e5cf01e6dc30d6b4ced20814558d5 Binary files /dev/null and b/assets/examples/source/s8.jpg differ diff --git a/assets/examples/source/s9.jpg b/assets/examples/source/s9.jpg new file mode 
100644 index 0000000000000000000000000000000000000000..3ef7251ba10bf83356587016b126a52bdbca7b18 Binary files /dev/null and b/assets/examples/source/s9.jpg differ diff --git a/assets/gradio_description_animation.md b/assets/gradio_description_animation.md new file mode 100644 index 0000000000000000000000000000000000000000..34b38979bd7420a76b9b37b42600e42a34e22665 --- /dev/null +++ b/assets/gradio_description_animation.md @@ -0,0 +1,7 @@ +🔥 To animate the source portrait with the driving video, please follow these steps: +
 + 1. Specify the options in the Animation Options section. We recommend enabling the "do crop" option when the face occupies a relatively small portion of your image. +
+
 + 2. Press the 🚀 Animate button. Your animated video will appear in the result block; this may take a few moments. +
diff --git a/assets/gradio_description_retargeting.md b/assets/gradio_description_retargeting.md new file mode 100644 index 0000000000000000000000000000000000000000..a99796dc0482c99afe48895c4aa0c6c5a3224770 --- /dev/null +++ b/assets/gradio_description_retargeting.md @@ -0,0 +1 @@ +🔥 To change the target eyes-open and lip-open ratio of the source portrait, please drag the sliders and then click the 🚗 Retargeting button. The result will be shown in the middle block. You can try running it multiple times. 😊 Set both ratios to 0.8 to see what's going on! diff --git a/assets/gradio_description_upload.md b/assets/gradio_description_upload.md new file mode 100644 index 0000000000000000000000000000000000000000..46a5fa5b94f4bcbc811ea25f3b48eb1613b6afcb --- /dev/null +++ b/assets/gradio_description_upload.md @@ -0,0 +1,2 @@ +## 🤗 This is the official gradio demo for **LivePortrait**. +
Please upload, or capture with the webcam, a source portrait in the Source Portrait field and a driving video in the Driving Video field.
diff --git a/assets/gradio_title.md b/assets/gradio_title.md new file mode 100644 index 0000000000000000000000000000000000000000..e2b765e17e85419c3cf512c75e435fc892d64a4a --- /dev/null +++ b/assets/gradio_title.md @@ -0,0 +1,10 @@ +
+
+

LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control

+
+ Project Page + +
+
+
diff --git a/inference.py b/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..8387e7f01657b64430f50603b45557d8ace3304f --- /dev/null +++ b/inference.py @@ -0,0 +1,33 @@ +# coding: utf-8 + +import tyro +from src.config.argument_config import ArgumentConfig +from src.config.inference_config import InferenceConfig +from src.config.crop_config import CropConfig +from src.live_portrait_pipeline import LivePortraitPipeline + + +def partial_fields(target_class, kwargs): + return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)}) + + +def main(): + # set tyro theme + tyro.extras.set_accent_color("bright_cyan") + args = tyro.cli(ArgumentConfig) + + # specify configs for inference + inference_cfg = partial_fields(InferenceConfig, args.__dict__) # use attribute of args to initial InferenceConfig + crop_cfg = partial_fields(CropConfig, args.__dict__) # use attribute of args to initial CropConfig + + live_portrait_pipeline = LivePortraitPipeline( + inference_cfg=inference_cfg, + crop_cfg=crop_cfg + ) + + # run + live_portrait_pipeline.execute(args) + + +if __name__ == '__main__': + main() diff --git a/pretrained_weights/.gitkeep b/pretrained_weights/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/readme.md b/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..f394e2ca0f4402a1f6e22e659cd562d80f7673a0 --- /dev/null +++ b/readme.md @@ -0,0 +1,143 @@ +

LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control

+ +
 + Jianzhu Guo 1†  + Dingyun Zhang 1,2  + Xiaoqiang Liu 1  + Zhizhou Zhong 1,3  + Yuan Zhang 1  +
+ +
+ Pengfei Wan 1  + Di Zhang 1  +
+ +
+ 1 Kuaishou Technology  2 University of Science and Technology of China  3 Fudan University  +
+ +
+
+ + + +
+
+ +

+ showcase +
 + 🔥 For more results, visit our homepage 🔥 +

+ + + +## ๐Ÿ”ฅ Updates +- **`2024/07/04`**: ๐Ÿ”ฅ We released the initial version of the inference code and models. Continuous updates, stay tuned! +- **`2024/07/04`**: ๐Ÿ˜Š We released the [homepage](https://liveportrait.github.io) and technical report on [arXiv](https://arxiv.org/pdf/2407.03168). + +## Introduction +This repo, named **LivePortrait**, contains the official PyTorch implementation of our paper [LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control](https://arxiv.org/pdf/2407.03168). +We are actively updating and improving this repository. If you find any bugs or have suggestions, welcome to raise issues or submit pull requests (PR) ๐Ÿ’–. + +## ๐Ÿ”ฅ Getting Started +### 1. Clone the code and prepare the environment +```bash +git clone https://github.com/KwaiVGI/LivePortrait +cd LivePortrait + +# create env using conda +conda create -n LivePortrait python==3.9.18 +conda activate LivePortrait +# install dependencies with pip +pip install -r requirements.txt +``` + +### 2. Download pretrained weights +Download our pretrained LivePortrait weights and face detection models of InsightFace from [Google Drive](https://drive.google.com/drive/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib) or [Baidu Yun](https://pan.baidu.com/s/1MGctWmNla_vZxDbEp2Dtzw?pwd=z5cn). We have packed all weights in one directory ๐Ÿ˜Š. Unzip and place them in `./pretrained_weights` ensuring the directory structure is as follows: +```text +pretrained_weights +โ”œโ”€โ”€ insightface +โ”‚ โ””โ”€โ”€ models +โ”‚ โ””โ”€โ”€ buffalo_l +โ”‚ โ”œโ”€โ”€ 2d106det.onnx +โ”‚ โ””โ”€โ”€ det_10g.onnx +โ””โ”€โ”€ liveportrait + โ”œโ”€โ”€ base_models + โ”‚ โ”œโ”€โ”€ appearance_feature_extractor.pth + โ”‚ โ”œโ”€โ”€ motion_extractor.pth + โ”‚ โ”œโ”€โ”€ spade_generator.pth + โ”‚ โ””โ”€โ”€ warping_module.pth + โ”œโ”€โ”€ landmark.onnx + โ””โ”€โ”€ retargeting_models + โ””โ”€โ”€ stitching_retargeting_module.pth +``` + +### 3. 
Inference 🚀 + +```bash +python inference.py +``` + +If the script runs successfully, you will get an output mp4 file named `animations/s6--d0_concat.mp4`. This file includes the following results: driving video, input image, and generated result. + +

+ image +

+ +Or, you can change the input by specifying the `-s` and `-d` arguments: + +```bash +python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4 + +# or disable pasting back +python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4 --no_flag_pasteback + +# more options to see +python inference.py -h +``` + +**More interesting results can be found in our [Homepage](https://liveportrait.github.io)** ๐Ÿ˜Š + +### 4. Gradio interface + +We also provide a Gradio interface for a better experience, just run by: + +```bash +python app.py +``` + +### 5. Inference speed evaluation ๐Ÿš€๐Ÿš€๐Ÿš€ +We have also provided a script to evaluate the inference speed of each module: + +```bash +python speed.py +``` + +Below are the results of inferring one frame on an RTX 4090 GPU using the native PyTorch framework with `torch.compile`: + +| Model | Parameters(M) | Model Size(MB) | Inference(ms) | +|-----------------------------------|:-------------:|:--------------:|:-------------:| +| Appearance Feature Extractor | 0.84 | 3.3 | 0.82 | +| Motion Extractor | 28.12 | 108 | 0.84 | +| Spade Generator | 55.37 | 212 | 7.59 | +| Warping Module | 45.53 | 174 | 5.21 | +| Stitching and Retargeting Modules| 0.23 | 2.3 | 0.31 | + +*Note: the listed values of Stitching and Retargeting Modules represent the combined parameter counts and the total sequential inference time of three MLP networks.* + + +## Acknowledgements +We would like to thank the contributors of [FOMM](https://github.com/AliaksandrSiarohin/first-order-model), [Open Facevid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis), [SPADE](https://github.com/NVlabs/SPADE), [InsightFace](https://github.com/deepinsight/insightface) repositories, for their open research and contributions. 
+ +## Citation ๐Ÿ’– +If you find LivePortrait useful for your research, welcome to ๐ŸŒŸ this repo and cite our work using the following BibTeX: +```bibtex +@article{guo2024live, + title = {LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control}, + author = {Jianzhu Guo and Dingyun Zhang and Xiaoqiang Liu and Zhizhou Zhong and Yuan Zhang and Pengfei Wan and Di Zhang}, + year = {2024}, + journal = {arXiv preprint:2407.03168}, +} +``` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..73dbda952975ac8965774e441aa7ef1ccfde1a72 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch==2.3.0 +torchvision==0.18.0 +torchaudio==2.3.0 + +numpy==1.26.4 +pyyaml==6.0.1 +opencv-python==4.10.0.84 +scipy==1.13.1 +imageio==2.34.2 +lmdb==1.4.1 +tqdm==4.66.4 +rich==13.7.1 +ffmpeg==1.4 +onnxruntime-gpu==1.18.0 +onnx==1.16.1 +scikit-image==0.24.0 +albumentations==1.4.10 +matplotlib==3.9.0 +imageio-ffmpeg==0.5.1 +tyro==0.8.5 +gradio==4.37.1 diff --git a/speed.py b/speed.py new file mode 100644 index 0000000000000000000000000000000000000000..02459d2aab703bcc873123f80608a4074b24f9f0 --- /dev/null +++ b/speed.py @@ -0,0 +1,192 @@ +# coding: utf-8 + +""" +Benchmark the inference speed of each module in LivePortrait. 
+ +TODO: heavy GPT style, need to refactor +""" + +import yaml +import torch +import time +import numpy as np +from src.utils.helper import load_model, concat_feat +from src.config.inference_config import InferenceConfig + + +def initialize_inputs(batch_size=1): + """ + Generate random input tensors and move them to GPU + """ + feature_3d = torch.randn(batch_size, 32, 16, 64, 64).cuda().half() + kp_source = torch.randn(batch_size, 21, 3).cuda().half() + kp_driving = torch.randn(batch_size, 21, 3).cuda().half() + source_image = torch.randn(batch_size, 3, 256, 256).cuda().half() + generator_input = torch.randn(batch_size, 256, 64, 64).cuda().half() + eye_close_ratio = torch.randn(batch_size, 3).cuda().half() + lip_close_ratio = torch.randn(batch_size, 2).cuda().half() + feat_stitching = concat_feat(kp_source, kp_driving).half() + feat_eye = concat_feat(kp_source, eye_close_ratio).half() + feat_lip = concat_feat(kp_source, lip_close_ratio).half() + + inputs = { + 'feature_3d': feature_3d, + 'kp_source': kp_source, + 'kp_driving': kp_driving, + 'source_image': source_image, + 'generator_input': generator_input, + 'feat_stitching': feat_stitching, + 'feat_eye': feat_eye, + 'feat_lip': feat_lip + } + + return inputs + + +def load_and_compile_models(cfg, model_config): + """ + Load and compile models for inference + """ + appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor') + motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor') + warping_module = load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module') + spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator') + stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module') + + models_with_params = [ + ('Appearance Feature Extractor', appearance_feature_extractor), + ('Motion Extractor', 
motion_extractor), + ('Warping Network', warping_module), + ('SPADE Decoder', spade_generator) + ] + + compiled_models = {} + for name, model in models_with_params: + model = model.half() + model = torch.compile(model, mode='max-autotune') # Optimize for inference + model.eval() # Switch to evaluation mode + compiled_models[name] = model + + retargeting_models = ['stitching', 'eye', 'lip'] + for retarget in retargeting_models: + module = stitching_retargeting_module[retarget].half() + module = torch.compile(module, mode='max-autotune') # Optimize for inference + module.eval() # Switch to evaluation mode + stitching_retargeting_module[retarget] = module + + return compiled_models, stitching_retargeting_module + + +def warm_up_models(compiled_models, stitching_retargeting_module, inputs): + """ + Warm up models to prepare them for benchmarking + """ + print("Warm up start!") + with torch.no_grad(): + for _ in range(10): + compiled_models['Appearance Feature Extractor'](inputs['source_image']) + compiled_models['Motion Extractor'](inputs['source_image']) + compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source']) + compiled_models['SPADE Decoder'](inputs['generator_input']) # Adjust input as required + stitching_retargeting_module['stitching'](inputs['feat_stitching']) + stitching_retargeting_module['eye'](inputs['feat_eye']) + stitching_retargeting_module['lip'](inputs['feat_lip']) + print("Warm up end!") + + +def measure_inference_times(compiled_models, stitching_retargeting_module, inputs): + """ + Measure inference times for each model + """ + times = {name: [] for name in compiled_models.keys()} + times['Retargeting Models'] = [] + + overall_times = [] + + with torch.no_grad(): + for _ in range(100): + torch.cuda.synchronize() + overall_start = time.time() + + start = time.time() + compiled_models['Appearance Feature Extractor'](inputs['source_image']) + torch.cuda.synchronize() + times['Appearance Feature 
Extractor'].append(time.time() - start) + + start = time.time() + compiled_models['Motion Extractor'](inputs['source_image']) + torch.cuda.synchronize() + times['Motion Extractor'].append(time.time() - start) + + start = time.time() + compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source']) + torch.cuda.synchronize() + times['Warping Network'].append(time.time() - start) + + start = time.time() + compiled_models['SPADE Decoder'](inputs['generator_input']) # Adjust input as required + torch.cuda.synchronize() + times['SPADE Decoder'].append(time.time() - start) + + start = time.time() + stitching_retargeting_module['stitching'](inputs['feat_stitching']) + stitching_retargeting_module['eye'](inputs['feat_eye']) + stitching_retargeting_module['lip'](inputs['feat_lip']) + torch.cuda.synchronize() + times['Retargeting Models'].append(time.time() - start) + + overall_times.append(time.time() - overall_start) + + return times, overall_times + + +def print_benchmark_results(compiled_models, stitching_retargeting_module, retargeting_models, times, overall_times): + """ + Print benchmark results with average and standard deviation of inference times + """ + average_times = {name: np.mean(times[name]) * 1000 for name in times.keys()} + std_times = {name: np.std(times[name]) * 1000 for name in times.keys()} + + for name, model in compiled_models.items(): + num_params = sum(p.numel() for p in model.parameters()) + num_params_in_millions = num_params / 1e6 + print(f"Number of parameters for {name}: {num_params_in_millions:.2f} M") + + for index, retarget in enumerate(retargeting_models): + num_params = sum(p.numel() for p in stitching_retargeting_module[retarget].parameters()) + num_params_in_millions = num_params / 1e6 + print(f"Number of parameters for part_{index} in Stitching and Retargeting Modules: {num_params_in_millions:.2f} M") + + for name, avg_time in average_times.items(): + std_time = std_times[name] + print(f"Average 
inference time for {name} over 100 runs: {avg_time:.2f} ms (std: {std_time:.2f} ms)") + + +def main(): + """ + Main function to benchmark speed and model parameters + """ + # Sample input tensors + inputs = initialize_inputs() + + # Load configuration + cfg = InferenceConfig(device_id=0) + model_config_path = cfg.models_config + with open(model_config_path, 'r') as file: + model_config = yaml.safe_load(file) + + # Load and compile models + compiled_models, stitching_retargeting_module = load_and_compile_models(cfg, model_config) + + # Warm up models + warm_up_models(compiled_models, stitching_retargeting_module, inputs) + + # Measure inference times + times, overall_times = measure_inference_times(compiled_models, stitching_retargeting_module, inputs) + + # Print benchmark results + print_benchmark_results(compiled_models, stitching_retargeting_module, ['stitching', 'eye', 'lip'], times, overall_times) + + +if __name__ == "__main__": + main() diff --git a/src/config/__init__.py b/src/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/config/argument_config.py b/src/config/argument_config.py new file mode 100644 index 0000000000000000000000000000000000000000..043162742ce65227076501c834289fda8d24a0cf --- /dev/null +++ b/src/config/argument_config.py @@ -0,0 +1,44 @@ +# coding: utf-8 + +""" +config for user +""" + +import os.path as osp +from dataclasses import dataclass +import tyro +from typing_extensions import Annotated +from .base_config import PrintableConfig, make_abs_path + + +@dataclass(repr=False) # use repr from PrintableConfig +class ArgumentConfig(PrintableConfig): + ########## input arguments ########## + source_image: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s6.jpg') # path to the source portrait + driving_info: Annotated[str, tyro.conf.arg(aliases=["-d"])] = 
class PrintableConfig:  # pylint: disable=too-few-public-methods
    """Base class that gives config objects a readable, one-attribute-per-line repr"""

    def __repr__(self):
        # header line carries the concrete class name
        rendered = [self.__class__.__name__ + ":"]
        for name, value in vars(self).items():
            if isinstance(value, Tuple):
                # tuples are shown bracketed, one element per line
                stacked = "[" + "".join(str(element) + "\n" for element in value)
                value = stacked.rstrip("\n") + "]"
            # a multi-line value contributes several output lines
            rendered.extend(f"{name}: {str(value)}".split("\n"))
        return "\n ".join(rendered)
make_abs_path('../../pretrained_weights/liveportrait/base_models/spade_generator.pth') # path to checkpoint + checkpoint_W: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/warping_module.pth') # path to checkpoint + + checkpoint_S: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth') # path to checkpoint + flag_use_half_precision: bool = True # whether to use half precision + + flag_lip_zero: bool = True # whether let the lip to close state before animation, only take effect when flag_eye_retargeting and flag_lip_retargeting is False + lip_zero_threshold: float = 0.03 + + flag_eye_retargeting: bool = False + flag_lip_retargeting: bool = False + flag_stitching: bool = True # we recommend setting it to True! + + flag_relative: bool = True # whether to use relative motion + anchor_frame: int = 0 # set this value if find_best_frame is True + + input_shape: Tuple[int, int] = (256, 256) # input shape + output_format: Literal['mp4', 'gif'] = 'mp4' # output video format + output_fps: int = 30 # fps for output video + crf: int = 15 # crf for output video + + flag_write_result: bool = True # whether to write output video + flag_pasteback: bool = True # whether to paste-back/stitch the animated face cropping from the face-cropping space to the original image space + mask_crop = None + flag_write_gif: bool = False + size_gif: int = 256 + ref_max_shape: int = 1280 + ref_shape_n: int = 2 + + device_id: int = 0 + flag_do_crop: bool = False # whether to crop the source portrait to the face-cropping space + flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True diff --git a/src/config/models.yaml b/src/config/models.yaml new file mode 100644 index 0000000000000000000000000000000000000000..131d1c65025c31e37af9239e211ea14454128a2e --- /dev/null +++ b/src/config/models.yaml @@ -0,0 +1,43 @@ +model_params: + appearance_feature_extractor_params: # the F in the paper + 
class GradioPipeline(LivePortraitPipeline):
    """Pipeline used by the gradio app.

    Adds two entry points on top of LivePortraitPipeline: video driven
    animation (`execute_video`) and single image eye/lip retargeting
    (`prepare_retargeting` followed by `execute_image`).
    """

    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):
        super().__init__(inference_cfg, crop_cfg)
        self.args = args
        # state cached by prepare_retargeting() and consumed by execute_image()
        self.start_prepare = False
        self.f_s_user = None
        self.x_c_s_info_user = None  # kept for backward compatibility; not written by this class
        self.x_s_info_user = None
        self.x_s_user = None
        self.source_lmk_user = None
        self.mask_ori = None
        self.img_rgb = None
        self.crop_M_c2o = None
        # must exist before the first upload: prepare_retargeting() returns it on the "clear" path
        self.I_s_vis = None

    def execute_video(
        self,
        input_image_path,
        input_video_path,
        flag_relative_input,
        flag_do_crop_input,
        flag_remap_input,
    ):
        """ for video driven portrait animation

        Returns (video_path, video_path_concat) as produced by `execute`.
        Raises gr.Error when either the source image or the driving video is missing.
        """
        if input_image_path is not None and input_video_path is not None:
            args_user = {
                'source_image': input_image_path,
                'driving_info': input_video_path,
                'flag_relative': flag_relative_input,
                'flag_do_crop': flag_do_crop_input,
                'flag_pasteback': flag_remap_input,
            }
            # update config from user input
            self.args = update_args(self.args, args_user)
            self.live_portrait_wrapper.update_config(self.args.__dict__)
            self.cropper.update_config(self.args.__dict__)
            # video driven animation
            video_path, video_path_concat = self.execute(self.args)
            gr.Info("Run successfully!", duration=2)
            return video_path, video_path_concat,
        else:
            raise gr.Error("The input source portrait or driving video hasn't been prepared yet 💥!", duration=5)

    def execute_image(self, input_eye_ratio: float, input_lip_ratio: float):
        """ for single image retargeting

        Returns (out, out_to_ori_blend): the retargeted crop and the result pasted
        back onto the cached source image.
        Raises gr.Error when a ratio is missing or no source portrait has been prepared.
        """
        # both sliders must carry a value (the lip ratio used to go unchecked)
        if input_eye_ratio is None or input_lip_ratio is None:
            raise gr.Error("Invalid ratio input 💥!", duration=5)
        elif self.f_s_user is None:
            if self.start_prepare:
                raise gr.Error(
                    "The source portrait is under processing 💥! Please wait for a second.",
                    duration=5
                )
            else:
                raise gr.Error(
                    "The source portrait hasn't been prepared yet 💥! Please scroll to the top of the page to upload.",
                    duration=5
                )
        else:
            # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
            combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio([[input_eye_ratio]], self.source_lmk_user)
            eyes_delta = self.live_portrait_wrapper.retarget_eye(self.x_s_user, combined_eye_ratio_tensor)
            # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
            combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio([[input_lip_ratio]], self.source_lmk_user)
            lip_delta = self.live_portrait_wrapper.retarget_lip(self.x_s_user, combined_lip_ratio_tensor)
            num_kp = self.x_s_user.shape[1]
            # default: use x_s
            x_d_new = self.x_s_user + eyes_delta.reshape(-1, num_kp, 3) + lip_delta.reshape(-1, num_kp, 3)
            # D(W(f_s; x_s, x′_d))
            out = self.live_portrait_wrapper.warp_decode(self.f_s_user, self.x_s_user, x_d_new)
            out = self.live_portrait_wrapper.parse_output(out['out'])[0]
            out_to_ori_blend = paste_back(out, self.crop_M_c2o, self.img_rgb, self.mask_ori)
            gr.Info("Run successfully!", duration=2)
            return out, out_to_ori_blend

    def prepare_retargeting(self, input_image_path, flag_do_crop = True):
        """ for single image retargeting

        Processes the uploaded source portrait, caches everything `execute_image`
        needs on the instance, and returns (eye_close_ratio, lip_close_ratio, I_s_vis)
        to initialise the sliders and the preview. With no image (clear button) it
        returns the default slider values and the last cached preview (None if
        nothing was ever uploaded).
        """
        if input_image_path is not None:
            gr.Info("Upload successfully!", duration=2)
            self.start_prepare = True
            inference_cfg = self.live_portrait_wrapper.cfg
            ######## process source portrait ########
            img_rgb = load_img_online(input_image_path, mode='rgb', max_dim=1280, n=16)
            log(f"Load source image from {input_image_path}.")
            crop_info = self.cropper.crop_single_image(img_rgb)
            if flag_do_crop:
                I_s = self.live_portrait_wrapper.prepare_source(crop_info['img_crop_256x256'])
            else:
                I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
            R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
            ############################################

            # record global info for next time use
            self.f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
            self.x_s_user = self.live_portrait_wrapper.transform_keypoint(x_s_info)
            self.x_s_info_user = x_s_info
            self.source_lmk_user = crop_info['lmk_crop']
            self.img_rgb = img_rgb
            self.crop_M_c2o = crop_info['M_c2o']
            self.mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
            # update slider
            eye_close_ratio = calc_eye_close_ratio(self.source_lmk_user[None])
            eye_close_ratio = float(eye_close_ratio.squeeze(0).mean())
            lip_close_ratio = calc_lip_close_ratio(self.source_lmk_user[None])
            lip_close_ratio = float(lip_close_ratio.squeeze(0).mean())
            # for vis
            self.I_s_vis = self.live_portrait_wrapper.parse_output(I_s)[0]
            return eye_close_ratio, lip_close_ratio, self.I_s_vis
        else:
            # when press the clear button, go here
            return 0.8, 0.8, self.I_s_vis
class LivePortraitPipeline(object):
    """End-to-end animation pipeline: crop the source portrait, extract its
    appearance/motion, drive it frame by frame and write the result videos."""

    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):
        self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(cfg=inference_cfg)
        self.cropper = Cropper(crop_cfg=crop_cfg)

    def execute(self, args: ArgumentConfig):
        """Animate `args.source_image` with `args.driving_info` and save the videos.

        args: uses `source_image`, `driving_info` (video file or pickled template)
              and `output_dir`; all flags are read from the wrapper's config.
        Returns (wfp, wfp_concat): path of the animated video and path of the
        side-by-side comparison video (None when driving from a template).
        Raises Exception when the driving input is neither a video nor a template.
        """
        inference_cfg = self.live_portrait_wrapper.cfg  # for convenience
        ######## process source portrait ########
        img_rgb = load_image_rgb(args.source_image)
        img_rgb = resize_to_limit(img_rgb, inference_cfg.ref_max_shape, inference_cfg.ref_shape_n)
        log(f"Load source image from {args.source_image}")
        crop_info = self.cropper.crop_single_image(img_rgb)
        source_lmk = crop_info['lmk_crop']
        img_crop_256x256 = crop_info['img_crop_256x256']
        if inference_cfg.flag_do_crop:
            I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)
        else:
            I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
        x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
        x_c_s = x_s_info['kp']
        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
        f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
        x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

        # Decide per run whether the lip-zero delta applies. This is kept in a local
        # so the shared config object is not modified and later runs (with other
        # source images) still honour the configured flag.
        flag_lip_zero = inference_cfg.flag_lip_zero
        lip_delta_before_animation = None
        if flag_lip_zero:
            # let lip-open scalar to be 0 at first
            c_d_lip_before_animation = [0.]
            combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
            if combined_lip_ratio_tensor_before_animation[0][0] < inference_cfg.lip_zero_threshold:
                # source lip ratio is already below the threshold: nothing to correct
                flag_lip_zero = False
            else:
                lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
        ############################################

        ######## process driving info ########
        driving_is_video = is_video(args.driving_info)  # evaluated once, reused in the frame loop
        if driving_is_video:
            log(f"Load from video file (mp4 mov avi etc...): {args.driving_info}")
            # TODO: track the driving video here -> build a template
            driving_rgb_lst = load_driving_info(args.driving_info)
            driving_rgb_lst_256 = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]
            I_d_lst = self.live_portrait_wrapper.prepare_driving_videos(driving_rgb_lst_256)
            n_frames = I_d_lst.shape[0]
            if inference_cfg.flag_eye_retargeting or inference_cfg.flag_lip_retargeting:
                driving_lmk_lst = self.cropper.get_retargeting_lmk_info(driving_rgb_lst)
                input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
        elif is_template(args.driving_info):
            log(f"Load from video templates {args.driving_info}")
            # NOTE(security): pickle.load executes arbitrary code from the file;
            # only load templates that come from a trusted source.
            with open(args.driving_info, 'rb') as f:
                template_lst, driving_lmk_lst = pickle.load(f)
            n_frames = template_lst[0]['n_frames']
            input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
        else:
            raise Exception("Unsupported driving types!")
        #########################################

        ######## prepare for pasteback ########
        if inference_cfg.flag_pasteback:
            mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
            I_p_paste_lst = []
        #########################################

        I_p_lst = []
        R_d_0, x_d_0_info = None, None
        for i in track(range(n_frames), description='Animating...', total=n_frames):
            if driving_is_video:
                # extract kp info by M
                I_d_i = I_d_lst[i]
                x_d_i_info = self.live_portrait_wrapper.get_kp_info(I_d_i)
                R_d_i = get_rotation_matrix(x_d_i_info['pitch'], x_d_i_info['yaw'], x_d_i_info['roll'])
            else:
                # from template
                x_d_i_info = template_lst[i]
                x_d_i_info = dct2cuda(x_d_i_info, inference_cfg.device_id)
                R_d_i = x_d_i_info['R_d']

            if i == 0:
                # the first driving frame is the reference for relative motion
                R_d_0 = R_d_i
                x_d_0_info = x_d_i_info

            if inference_cfg.flag_relative:
                R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
                delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
                scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
            else:
                R_new = R_d_i
                delta_new = x_d_i_info['exp']
                scale_new = x_s_info['scale']
                t_new = x_d_i_info['t']

            t_new[..., 2].fill_(0)  # zero tz
            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new

            # Algorithm 1:
            if not inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
                # without stitching or retargeting
                if flag_lip_zero:
                    x_d_i_new += lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
                else:
                    pass
            elif inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
                # with stitching and without retargeting
                if flag_lip_zero:
                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
                else:
                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
            else:
                eyes_delta, lip_delta = None, None
                if inference_cfg.flag_eye_retargeting:
                    c_d_eyes_i = input_eye_ratio_lst[i]
                    combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
                    # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
                    eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)
                if inference_cfg.flag_lip_retargeting:
                    c_d_lip_i = input_lip_ratio_lst[i]
                    combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
                    lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)

                if inference_cfg.flag_relative:  # use x_s
                    x_d_i_new = x_s + \
                        (eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
                        (lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
                else:  # use x_d,i
                    x_d_i_new = x_d_i_new + \
                        (eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
                        (lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)

                if inference_cfg.flag_stitching:
                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)

            out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)
            I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
            I_p_lst.append(I_p_i)

            if inference_cfg.flag_pasteback:
                I_p_i_to_ori_blend = paste_back(I_p_i, crop_info['M_c2o'], img_rgb, mask_ori)
                I_p_paste_lst.append(I_p_i_to_ori_blend)

        mkdir(args.output_dir)
        wfp_concat = None
        if driving_is_video:
            frames_concatenated = concat_frames(I_p_lst, driving_rgb_lst, img_crop_256x256)
            # save (driving frames, source image, drived frames) result
            wfp_concat = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}_concat.mp4')
            images2video(frames_concatenated, wfp=wfp_concat)

        # save drived result
        wfp = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}.mp4')
        if inference_cfg.flag_pasteback:
            images2video(I_p_paste_lst, wfp=wfp)
        else:
            images2video(I_p_lst, wfp=wfp)

        return wfp, wfp_concat
def prepare_source(self, img: np.ndarray) -> torch.Tensor:
    """ construct the input as standard
    img: HxWx3 or BxHxWx3, uint8
    return: Bx3xHxW float32 tensor in 0~1 on the configured CUDA device (B=1 for a single image)
    raises: ValueError if img is neither 3-D nor 4-D
    """
    if img.ndim not in (3, 4):
        raise ValueError(f'img ndim should be 3 or 4: {img.ndim}')

    # self.cfg.input_shape is compared against (h, w), i.e. it is (height, width)
    target_h, target_w = self.cfg.input_shape[0], self.cfg.input_shape[1]

    batch = img[np.newaxis] if img.ndim == 3 else img  # HxWx3 -> 1xHxWx3
    h, w = batch.shape[1:3]  # spatial dims, also correct for batched input
    if h != target_h or w != target_w:
        # cv2.resize expects dsize as (width, height); resize every frame of the batch
        batch = np.stack([cv2.resize(frame, (target_w, target_h)) for frame in batch])

    x = batch.astype(np.float32) / 255.  # astype copies, the caller's array is untouched
    x = np.clip(x, 0, 1)  # clip to 0~1
    x = torch.from_numpy(x).permute(0, 3, 1, 2)  # BxHxWx3 -> Bx3xHxW
    x = x.cuda(self.device_id)
    return x
def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
    """ run the motion extractor M on x and return its keypoint dict
    x: Bx3xHxW, normalized to 0~1
    flag_refine_info (keyword, default True): convert pitch/yaw/roll through
        headpose_pred_to_degree (adding a trailing axis) and reshape 'kp' and 'exp' to BxNx3
    return: the dict produced by the motion extractor, updated in place
    """
    half_precision = self.cfg.flag_use_half_precision

    with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_precision):
            info = self.motion_extractor(x)

        if half_precision:
            # cast every tensor entry back to float32, keeping the same dict object
            info.update({
                name: value.float()
                for name, value in info.items()
                if isinstance(value, torch.Tensor)
            })

    if kwargs.get('flag_refine_info', True):
        batch_size = info['kp'].shape[0]
        for angle in ('pitch', 'yaw', 'roll'):
            info[angle] = headpose_pred_to_degree(info[angle])[:, None]  # Bx1
        for name in ('kp', 'exp'):
            info[name] = info[name].reshape(batch_size, -1, 3)  # BxNx3

    return info
def transform_keypoint(self, kp_info: dict):
    """
    apply pose, expression and translation to the canonical keypoints
    kp_info: dict with 'kp', 'pitch', 'yaw', 'roll', 't', 'exp', 'scale'
    return: transformed keypoints, BxNx3
    """
    canonical_kp = kp_info['kp']  # (bs, k, 3) or (bs, k*3)
    pitch = kp_info['pitch']
    yaw = kp_info['yaw']
    roll = kp_info['roll']
    translation = kp_info['t']
    expression = kp_info['exp']
    scale = kp_info['scale']

    rotation = get_rotation_matrix(
        headpose_pred_to_degree(pitch),
        headpose_pred_to_degree(yaw),
        headpose_pred_to_degree(roll),
    )  # (bs, 3, 3)

    batch_size = canonical_kp.shape[0]
    # a flat Bx(num_kp*3) layout carries three values per keypoint
    num_kp = canonical_kp.shape[1] // 3 if canonical_kp.ndim == 2 else canonical_kp.shape[1]

    # Eqn.2: s * (R * x_c,s + exp) + t
    rotated = canonical_kp.view(batch_size, num_kp, 3) @ rotation
    kp_transformed = rotated + expression.view(batch_size, num_kp, 3)
    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
    kp_transformed[:, :, 0:2] += translation[:, None, 0:2]  # remove z, only apply tx ty

    return kp_transformed
def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """ apply the stitching correction to the driving keypoints
    kp_source: Bxnum_kpx3
    kp_driving: Bxnum_kpx3
    return: corrected copy of kp_driving, or kp_driving itself when no
        stitching module is loaded
    """
    if self.stitching_retargeting_module is None:
        # nothing to apply: hand the input back untouched
        return kp_driving

    batch_size, num_kp = kp_source.shape[:2]
    flat_len = 3 * num_kp

    stitched = kp_driving.clone()  # work on a copy, the caller's tensor stays intact
    correction = self.stitch(kp_source, stitched)

    # first 3*num_kp values: per-keypoint offsets; next 2 values: shared (tx, ty)
    offset_kp = correction[..., :flat_len].reshape(batch_size, num_kp, 3)
    offset_xy = correction[..., flat_len:flat_len + 2].reshape(batch_size, 1, 2)

    stitched += offset_kp
    stitched[..., :2] += offset_xy

    return stitched
def calc_combined_lip_ratio(self, input_lip_ratio, source_lmk):
    """ build the lip retargeting condition [c_s,lip, c_d,lip,i]
    input_lip_ratio: driving lip ratio; only its first element is used
    source_lmk: source landmarks, passed to calc_lip_close_ratio with a leading batch axis
    return: source ratio and driving ratio concatenated along dim 1, on the configured CUDA device
    """
    lip_close_ratio = calc_lip_close_ratio(source_lmk[None])
    lip_close_ratio_tensor = torch.from_numpy(lip_close_ratio).float().cuda(self.device_id)
    # [c_s,lip, c_d,lip,i]
    # Always bring the driving ratio to 1x1. The previous guard compared a
    # torch.Size with a list, which is never equal, so the reshape already ran
    # on every call; this also mirrors calc_combined_eye_ratio.
    input_lip_ratio_tensor = torch.Tensor([input_lip_ratio[0]]).reshape(1, 1).cuda(self.device_id)
    combined_lip_ratio_tensor = torch.cat([lip_close_ratio_tensor, input_lip_ratio_tensor], dim=1)
    return combined_lip_ratio_tensor
class AppearanceFeatureExtractor(nn.Module):
    """ Appearance extractor (F): maps the source image to a 3D appearance feature volume. """

    def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):
        """
        image_channel: channels of the input image
        block_expansion: channel width after the first block; doubled per down block
        num_down_blocks: number of 2D down-sampling blocks
        max_features: cap on the channel width, also the output width of the 1x1 conv
        reshape_channel, reshape_depth: channel/depth split used to fold the 2D map into a 3D volume
        num_resblocks: number of 3D residual blocks applied to the volume
        """
        super().__init__()
        self.image_channel = image_channel
        self.block_expansion = block_expansion
        self.num_down_blocks = num_down_blocks
        self.max_features = max_features
        self.reshape_channel = reshape_channel
        self.reshape_depth = reshape_depth

        def _width(level):
            # channel width after `level` down blocks, capped at max_features
            return min(max_features, block_expansion * (2 ** level))

        self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))

        self.down_blocks = nn.ModuleList([
            DownBlock2d(_width(i), _width(i + 1), kernel_size=(3, 3), padding=(1, 1))
            for i in range(num_down_blocks)
        ])

        # Derive the input width from the formula instead of a variable leaked
        # out of the loop, so num_down_blocks == 0 no longer hits an unbound name.
        self.second = nn.Conv2d(in_channels=_width(num_down_blocks), out_channels=max_features, kernel_size=1, stride=1)

        self.resblocks_3d = torch.nn.Sequential()
        for i in range(num_resblocks):
            self.resblocks_3d.add_module('3dr' + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1))

    def forward(self, source_image):
        """
        source_image: BxCxHxW, e.g. Bx3x256x256
        return: f_s, Bx(reshape_channel)x(reshape_depth)xhxw, e.g. Bx32x16x64x64
        """
        out = self.first(source_image)  # e.g. Bx3x256x256 -> Bx64x256x256

        for down_block in self.down_blocks:
            out = down_block(out)
        out = self.second(out)
        bs, _, h, w = out.shape  # e.g. Bx512x64x64

        # fold the channel axis into (channel, depth) to obtain a 3D volume
        f_s = out.view(bs, self.reshape_channel, self.reshape_depth, h, w)
        f_s = self.resblocks_3d(f_s)
        return f_s
def __init__(
    self,
    in_chans=3,
    depths=[3, 3, 9, 3],
    dims=[96, 192, 384, 768],
    drop_path_rate=0.,
    **kwargs
):
    """ build the ConvNeXtV2 backbone and its output heads
    in_chans: number of input image channels
    depths: number of blocks in each of the 4 stages
    dims: feature dimension of each of the 4 stages
    drop_path_rate: end value of the stochastic depth schedule, spaced linearly from 0 over all blocks
    kwargs: `num_bins` (default 66) sizes the pitch/yaw/roll heads,
            `num_kp` (default 24) sizes the keypoint and expression heads
    """
    super().__init__()
    # The signature defaults are single shared list objects; copy them so
    # nothing stored on this instance aliases a list other instances share.
    depths = list(depths)
    dims = list(dims)
    self.depths = depths

    # stem and 3 intermediate downsampling conv layers
    self.downsample_layers = nn.ModuleList()
    self.downsample_layers.append(nn.Sequential(
        nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
        LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
    ))
    for i in range(3):
        self.downsample_layers.append(nn.Sequential(
            LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
            nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
        ))

    # 4 feature resolution stages, each consisting of multiple residual blocks
    self.stages = nn.ModuleList()
    dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
    cur = 0  # index of the first block of the current stage within dp_rates
    for i in range(4):
        self.stages.append(nn.Sequential(
            *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
        ))
        cur += depths[i]

    self.norm = nn.LayerNorm(dims[-1], eps=1e-6)  # final norm layer

    # NOTE: the output semantic items
    num_bins = kwargs.get('num_bins', 66)
    num_kp = kwargs.get('num_kp', 24)  # the number of implicit keypoints
    self.fc_kp = nn.Linear(dims[-1], 3 * num_kp)  # implicit keypoints

    self.fc_scale = nn.Linear(dims[-1], 1)  # scale
    self.fc_pitch = nn.Linear(dims[-1], num_bins)  # pitch bins
    self.fc_yaw = nn.Linear(dims[-1], num_bins)  # yaw bins
    self.fc_roll = nn.Linear(dims[-1], num_bins)  # roll bins
    self.fc_t = nn.Linear(dims[-1], 3)  # translation
    self.fc_exp = nn.Linear(dims[-1], 3 * num_kp)  # expression / delta
def forward_features(self, x):
    """ run the backbone and pool to one feature vector per sample
    x: (N, C, H, W)
    return: (N, C') -- each downsampling layer followed by its stage, then a
            global average over the two spatial axes, then the final norm
    """
    # __init__ builds exactly four downsampling layers and four stages.
    for downsample, stage in zip(self.downsample_layers, self.stages):
        x = stage(downsample(x))

    pooled = x.mean([-2, -1])  # global average pooling, (N, C, H, W) -> (N, C)
    return self.norm(pooled)
def create_deformed_feature(self, feature, sparse_motions):
    """ warp the feature volume once per sparse motion field
    feature: (bs, c, d, h, w)
    sparse_motions: (bs, num_kp+1, d, h, w, 3) sampling grids
    return: (bs, num_kp+1, c, d, h, w)
    """
    batch, channels, depth, height, width = feature.shape
    n_fields = self.num_kp + 1

    # one copy of the feature volume per motion field, folded into the batch axis
    tiled = feature.unsqueeze(1).expand(batch, n_fields, channels, depth, height, width)
    tiled = tiled.reshape(batch * n_fields, channels, depth, height, width)  # (bs*(num_kp+1), c, d, h, w)

    grids = sparse_motions.view((batch * n_fields, depth, height, width, -1))  # (bs*(num_kp+1), d, h, w, 3)
    warped = F.grid_sample(tiled, grids, align_corners=False)

    return warped.view((batch, n_fields, -1, depth, height, width))  # (bs, num_kp+1, c, d, h, w)
def forward(self, feature, kp_driving, kp_source):
    """ predict the dense motion from the sparse keypoint motion
    feature: (bs, c, d, h, w) feature volume, e.g. (bs, 32, 16, 64, 64)
    kp_driving, kp_source: keypoints handed to the sparse-motion and heatmap helpers
    return: dict with 'mask' (bs, 1+num_kp, d, h, w), 'deformation' (bs, d, h, w, 3)
            and, when occlusion estimation is enabled, 'occlusion_map'
    """
    batch, _, depth, height, width = feature.shape

    # compress -> norm -> relu, e.g. down to (bs, 4, 16, 64, 64)
    compressed = F.relu(self.norm(self.compress(feature)))

    # 1. deform the 3d feature with every sparse motion field
    motions = self.create_sparse_motions(compressed, kp_driving, kp_source)  # (bs, 1+num_kp, d, h, w, 3)
    warped = self.create_deformed_feature(compressed, motions)  # (bs, 1+num_kp, c, d, h, w)

    # 2. keypoint heatmaps, (bs, 1+num_kp, 1, d, h, w)
    heatmap = self.create_heatmap_representations(warped, kp_driving, kp_source)

    # stack heatmap and warped feature per field, then fold the fields into channels
    hourglass_in = torch.cat([heatmap, warped], dim=2).view(batch, -1, depth, height, width)
    prediction = self.hourglass(hourglass_in)

    mask = F.softmax(self.mask(prediction), dim=1)  # (bs, 1+num_kp, d, h, w)

    # blend the sparse motions with the softmax weights; the mask takes effect here
    weighted = motions.permute(0, 1, 5, 2, 3, 4) * mask.unsqueeze(2)  # (bs, num_kp+1, 3, d, h, w)
    deformation = weighted.sum(dim=1).permute(0, 2, 3, 4, 1)  # (bs, d, h, w, 3)

    result = {'mask': mask, 'deformation': deformation}

    if self.flag_estimate_occlusion_map:
        p_batch, _, _, p_height, p_width = prediction.shape
        flattened = prediction.view(p_batch, -1, p_height, p_width)  # fold depth into channels
        result['occlusion_map'] = torch.sigmoid(self.occlusion(flattened))  # e.g. Bx1x64x64

    return result
class SPADEDecoder(nn.Module):
    """ Spade decoder (G): turns the warped feature map into the animated image. """

    def __init__(self, upscale=1, max_features=256, block_expansion=64, out_channels=64, num_down_blocks=2):
        """
        upscale: None or <= 1 uses a plain 3-channel conv head; any larger value
                 uses a conv + PixelShuffle head with a fixed factor of 2
        max_features: cap on the input channel width
        block_expansion, num_down_blocks: input width is block_expansion * 2**num_down_blocks, capped
        out_channels: channel width fed to the image head
        """
        # Initialise nn.Module before touching any attribute.
        super().__init__()
        self.upscale = upscale

        # Closed form of the value the former loop ended on; also defined
        # when num_down_blocks == 0, where the loop left the name unbound.
        input_channels = min(max_features, block_expansion * (2 ** num_down_blocks))
        norm_G = 'spadespectralinstance'
        label_num_channels = input_channels  # 256 with the defaults

        self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)
        # G_middle_0 .. G_middle_5, registered in the same order and under the
        # same names so existing checkpoints keep loading.
        for idx in range(6):
            setattr(self, f'G_middle_{idx}',
                    SPADEResnetBlock(2 * input_channels, 2 * input_channels, norm_G, label_num_channels))
        self.up_0 = SPADEResnetBlock(2 * input_channels, input_channels, norm_G, label_num_channels)
        self.up_1 = SPADEResnetBlock(input_channels, out_channels, norm_G, label_num_channels)
        self.up = nn.Upsample(scale_factor=2)

        if self.upscale is None or self.upscale <= 1:
            self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)
        else:
            self.conv_img = nn.Sequential(
                nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),
                nn.PixelShuffle(upscale_factor=2)
            )

    def forward(self, feature):
        """
        feature: warped feature map, e.g. Bx256x64x64; also used as the SPADE conditioning map
        return: Bx3xHxW image squashed to (0, 1) by a sigmoid
        """
        seg = feature  # e.g. Bx256x64x64
        x = self.fc(feature)  # e.g. Bx512x64x64
        for idx in range(6):
            x = getattr(self, f'G_middle_{idx}')(x, seg)

        x = self.up(x)  # e.g. Bx512x64x64 -> Bx512x128x128
        x = self.up_0(x, seg)  # e.g. Bx512x128x128 -> Bx256x128x128
        x = self.up(x)  # e.g. Bx256x128x128 -> Bx256x256x256
        x = self.up_1(x, seg)  # e.g. Bx256x256x256 -> Bx64x256x256

        x = self.conv_img(F.leaky_relu(x, 2e-1))  # e.g. Bx64x256x256 -> Bx3xHxW
        x = torch.sigmoid(x)  # Bx3xHxW

        return x
class StitchingRetargetingNetwork(nn.Module):
    """ Plain MLP used for the stitching module and the eye/lip retargeting modules. """

    def __init__(self, input_size, hidden_sizes, output_size):
        """
        input_size: width of the input feature
        hidden_sizes: widths of the hidden layers, each followed by a ReLU; may be empty
        output_size: width of the output
        """
        super().__init__()
        # Chain input -> hidden... -> output. With no hidden layer this is a
        # single Linear instead of an IndexError on hidden_sizes[-1].
        widths = [input_size, *hidden_sizes]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(widths[-1], output_size))
        self.mlp = nn.Sequential(*layers)

    def initialize_weights_to_zero(self):
        """ zero the weight and bias of every Linear layer """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.zeros_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """ x: Bxinput_size -> Bxoutput_size """
        return self.mlp(x)
def make_coordinate_grid(spatial_size, ref, **kwargs):
    """ build a normalized 3D coordinate grid
    spatial_size: (d, h, w)
    ref: tensor whose dtype and device the grid follows
    kwargs: accepted but unused
    return: d x h x w x 3, last axis ordered (x, y, z); index 0 maps to -1 and
            the last index of each axis maps to 1
    """
    depth, height, width = spatial_size

    def _normalized_axis(n):
        # indices 0..n-1 mapped linearly onto [-1, 1]
        idx = torch.arange(n).type(ref.dtype).to(ref.device)
        return 2 * (idx / (n - 1)) - 1

    # NOTE: must be right-down-in
    xs = _normalized_axis(width)   # the x axis faces to the right
    ys = _normalized_axis(height)  # the y axis faces to the bottom
    zs = _normalized_axis(depth)   # the z axis faces to the inner

    # broadcast every axis over the full (d, h, w) volume
    grid_x = xs.view(1, 1, -1).repeat(depth, height, 1)
    grid_y = ys.view(1, -1, 1).repeat(depth, 1, width)
    grid_z = zs.view(-1, 1, 1).repeat(1, height, width)

    return torch.stack([grid_x, grid_y, grid_z], dim=3)
class ResBlock3d(nn.Module):
    """
    Pre-activation 3D residual block (norm -> relu -> conv, twice) that keeps
    the channel count and adds the input back onto the result.
    """

    def __init__(self, in_features, kernel_size, padding):
        super().__init__()
        conv_kwargs = dict(in_channels=in_features, out_channels=in_features,
                           kernel_size=kernel_size, padding=padding)
        self.conv1 = nn.Conv3d(**conv_kwargs)
        self.conv2 = nn.Conv3d(**conv_kwargs)
        self.norm1 = nn.BatchNorm3d(in_features, affine=True)
        self.norm2 = nn.BatchNorm3d(in_features, affine=True)

    def forward(self, x):
        residual = self.conv1(F.relu(self.norm1(x)))
        residual = self.conv2(F.relu(self.norm2(residual)))
        residual += x  # skip connection
        return residual
class DownBlock3d(nn.Module):
    """
    Downsampling block for use in encoder: conv -> batch norm -> relu, then
    average pooling that halves height and width and keeps the depth.
    """

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super().__init__()
        # Downsampling is done by the pooling layer below. The disabled
        # strided-conv variant (stride=(1, 2, 2)) that used to sit here as a
        # no-op string statement has been removed.
        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
                              padding=padding, groups=groups)
        self.norm = nn.BatchNorm3d(out_features, affine=True)
        self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))

    def forward(self, x):
        out = self.conv(x)
        out = self.norm(out)
        out = F.relu(out)
        out = self.pool(out)
        return out
class Encoder(nn.Module):
    """
    Hourglass Encoder: a chain of DownBlock3d stages that returns every
    intermediate feature map.
    """

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super().__init__()

        stages = []
        in_ch = in_features
        for level in range(1, num_blocks + 1):
            # width doubles per stage and is capped at max_features
            out_ch = min(max_features, block_expansion * (2 ** level))
            stages.append(DownBlock3d(in_ch, out_ch, kernel_size=3, padding=1))
            in_ch = out_ch
        self.down_blocks = nn.ModuleList(stages)

    def forward(self, x):
        """ return [x, stage_1(x), stage_2(stage_1(x)), ...] """
        pyramid = [x]
        current = x
        for stage in self.down_blocks:
            current = stage(current)
            pyramid.append(current)
        return pyramid
class SPADE(nn.Module):
    """
    Spatially-adaptive normalization: instance-normalizes x without learned
    affine parameters, then modulates it with a per-pixel scale and shift
    predicted from the conditioning map.
    """

    def __init__(self, norm_nc, label_nc):
        super().__init__()

        self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
        hidden_channels = 128

        self.mlp_shared = nn.Sequential(
            nn.Conv2d(label_nc, hidden_channels, kernel_size=3, padding=1),
            nn.ReLU())
        self.mlp_gamma = nn.Conv2d(hidden_channels, norm_nc, kernel_size=3, padding=1)
        self.mlp_beta = nn.Conv2d(hidden_channels, norm_nc, kernel_size=3, padding=1)

    def forward(self, x, segmap):
        """
        x: Bx(norm_nc)xHxW features to normalize
        segmap: Bx(label_nc)xhxw conditioning map, resized to HxW with nearest-neighbour
        """
        resized = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
        shared = self.mlp_shared(resized)
        scale = 1 + self.mlp_gamma(shared)
        shift = self.mlp_beta(shared)
        return self.param_free_norm(x) * scale + shift
class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
    channels_last expects inputs shaped (batch_size, height, width, channels) and
    channels_first expects (batch_size, channels, height, width); in both cases the
    normalization runs over the channel axis.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.data_format == "channels_first":
            # manual layer norm over dim 1, using the biased variance
            centered = x - x.mean(1, keepdim=True)
            variance = centered.pow(2).mean(1, keepdim=True)
            normed = centered / torch.sqrt(variance + self.eps)
            return self.weight[:, None, None] * normed + self.bias[:, None, None]
def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
    """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Returns `x` itself when `drop_prob` is 0 or `training` is False. Otherwise
    each sample along dim 0 is kept with probability `1 - drop_prob`; kept
    samples are divided by that probability when `scale_by_keep` is set and it
    is positive, dropped samples become zero.
    """
    if drop_prob == 0. or not training:
        return x

    keep_prob = 1 - drop_prob
    # one Bernoulli draw per sample, broadcast over every remaining dim
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        keep_mask.div_(keep_prob)
    return x * keep_mask
class WarpingNetwork(nn.Module):
    """ Warping field estimator (W): warps the source feature volume with the predicted dense motion. """

    def __init__(
        self,
        num_kp,
        block_expansion,
        max_features,
        num_down_blocks,
        reshape_channel,
        estimate_occlusion_map=False,
        dense_motion_params=None,
        **kwargs
    ):
        """
        num_kp, reshape_channel, estimate_occlusion_map: forwarded to DenseMotionNetwork
        block_expansion, num_down_blocks: output width of the 2D head is block_expansion * 2**num_down_blocks
        max_features: input width of the 2D head
        dense_motion_params: extra DenseMotionNetwork kwargs; None builds no motion network
        kwargs: `upscale` (default 1), `flag_use_occlusion_map` (default True)
        """
        super().__init__()

        self.upscale = kwargs.get('upscale', 1)
        self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)

        if dense_motion_params is not None:
            self.dense_motion_network = DenseMotionNetwork(
                num_kp=num_kp,
                feature_channel=reshape_channel,
                estimate_occlusion_map=estimate_occlusion_map,
                **dense_motion_params
            )
        else:
            self.dense_motion_network = None

        head_channels = block_expansion * (2 ** num_down_blocks)
        self.third = SameBlock2d(max_features, head_channels, kernel_size=(3, 3), padding=(1, 1), lrelu=True)
        self.fourth = nn.Conv2d(in_channels=head_channels, out_channels=head_channels, kernel_size=1, stride=1)

        self.estimate_occlusion_map = estimate_occlusion_map

    def deform_input(self, inp, deformation):
        """ sample `inp` at the locations given by the `deformation` grid """
        return F.grid_sample(inp, deformation, align_corners=False)

    def forward(self, feature_3d, kp_driving, kp_source):
        """
        feature_3d: source feature volume, e.g. Bx32x16x64x64
        kp_driving, kp_source: keypoints handed to the dense motion network
        return: dict with 'occlusion_map' (or None), 'deformation' and 'out'
        raises: RuntimeError when the module was built without a dense motion network
        """
        if self.dense_motion_network is None:
            # Without a motion network there is no deformation to apply and the
            # result dict cannot be built; fail with a clear message instead of
            # an unbound-variable error.
            raise RuntimeError(
                'WarpingNetwork.forward requires a dense motion network; '
                'construct the module with dense_motion_params.'
            )

        # Feature warper, transforming feature representation according to deformation and occlusion
        dense_motion = self.dense_motion_network(
            feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source
        )
        occlusion_map = dense_motion.get('occlusion_map')  # e.g. Bx1x64x64, None when not estimated

        deformation = dense_motion['deformation']  # e.g. Bx16x64x64x3
        out = self.deform_input(feature_3d, deformation)  # e.g. Bx32x16x64x64

        bs, c, d, h, w = out.shape
        out = out.view(bs, c * d, h, w)  # fold depth into channels, e.g. -> Bx512x64x64
        out = self.third(out)  # e.g. -> Bx256x64x64
        out = self.fourth(out)  # e.g. -> Bx256x64x64

        if self.flag_use_occlusion_map and (occlusion_map is not None):
            out = out * occlusion_map

        ret_dct = {
            'occlusion_map': occlusion_map,
            'deformation': deformation,
            'out': out,
        }

        return ret_dct
LivePortraitWrapper(cfg=inference_cfg) + self.cropper = Cropper(crop_cfg=crop_cfg) + + def make_motion_template(self, video_fp: str, output_path: str, **kwargs): + """ make video template (.pkl format) + video_fp: driving video file path + output_path: where to save the pickle file + """ + + driving_rgb_lst = load_driving_info(video_fp) + driving_rgb_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst] + driving_lmk_lst = self.cropper.get_retargeting_lmk_info(driving_rgb_lst) + I_d_lst = self.live_portrait_wrapper.prepare_driving_videos(driving_rgb_lst) + + n_frames = I_d_lst.shape[0] + + templates = [] + + + for i in track(range(n_frames), description='Making templates...', total=n_frames): + I_d_i = I_d_lst[i] + x_d_i_info = self.live_portrait_wrapper.get_kp_info(I_d_i) + R_d_i = get_rotation_matrix(x_d_i_info['pitch'], x_d_i_info['yaw'], x_d_i_info['roll']) + # collect s_d, R_d, ฮด_d and t_d for inference + template_dct = { + 'n_frames': n_frames, + 'frames_index': i, + } + template_dct['scale'] = x_d_i_info['scale'].cpu().numpy().astype(np.float32) + template_dct['R_d'] = R_d_i.cpu().numpy().astype(np.float32) + template_dct['exp'] = x_d_i_info['exp'].cpu().numpy().astype(np.float32) + template_dct['t'] = x_d_i_info['t'].cpu().numpy().astype(np.float32) + + templates.append(template_dct) + + mkdir(output_path) + # Save the dictionary as a pickle file + pickle_fp = os.path.join(output_path, f'{basename(video_fp)}.pkl') + with open(pickle_fp, 'wb') as f: + pickle.dump([templates, driving_lmk_lst], f) + log(f"Template saved at {pickle_fp}") diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/utils/camera.py b/src/utils/camera.py new file mode 100644 index 0000000000000000000000000000000000000000..8bbfc90ac87e99caf09bee69982761fdde753527 --- /dev/null +++ b/src/utils/camera.py @@ -0,0 +1,75 @@ +# coding: utf-8 + +""" +functions 
def get_rotation_matrix(pitch_, yaw_, roll_):
    """ Build batched 3x3 rotation matrices from Euler angles given in degrees.

    Args:
        pitch_, yaw_, roll_: tensors of shape ``()``, ``(bs,)`` or ``(bs, 1)``
            holding the rotation about the x, y and z axis respectively.

    Returns:
        Tensor of shape ``(bs, 3, 3)``: the transpose of ``rot_z @ rot_y @ rot_x``.
    """
    def _as_column(angle_deg):
        # degrees -> radians, then normalise the layout to (bs, 1)
        angle = angle_deg / 180 * PI
        if angle.ndim == 0:
            # scalar tensors used to crash at `pitch.shape[0]`; treat them as a batch of one
            angle = angle.reshape(1)
        if angle.ndim == 1:
            angle = angle.unsqueeze(1)
        return angle

    x, y, z = _as_column(pitch_), _as_column(yaw_), _as_column(roll_)

    device = x.device
    bs = x.shape[0]
    # allocate the constants directly on the target device instead of CPU + copy
    ones = torch.ones([bs, 1], device=device)
    zeros = torch.zeros([bs, 1], device=device)

    rot_x = torch.cat([
        ones, zeros, zeros,
        zeros, torch.cos(x), -torch.sin(x),
        zeros, torch.sin(x), torch.cos(x)
    ], dim=1).reshape([bs, 3, 3])

    rot_y = torch.cat([
        torch.cos(y), zeros, torch.sin(y),
        zeros, ones, zeros,
        -torch.sin(y), zeros, torch.cos(y)
    ], dim=1).reshape([bs, 3, 3])

    rot_z = torch.cat([
        torch.cos(z), -torch.sin(z), zeros,
        torch.sin(z), torch.cos(z), zeros,
        zeros, zeros, ones
    ], dim=1).reshape([bs, 3, 3])

    rot = rot_z @ rot_y @ rot_x
    return rot.permute(0, 2, 1)  # transpose
def parse_pt2_from_pt101(pt101, use_lip=True):
    """
    Reduce a 101-point landmark array to the two anchor points used to cancel the roll.

    With ``use_lip`` the anchors are (eye midpoint, lip midpoint); otherwise they
    are (left eye centre, right eye centre). The result has shape 2x2.
    """
    # Eye centres are interpolated from four landmarks each; an earlier version
    # read a single eye-centre point, which was not robust.
    left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)
    right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    eye_mid = (left_eye + right_eye) / 2
    lip_mid = (pt101[75] + pt101[81]) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
def parse_pt2_from_pt68(pt68, use_lip=True):
    """
    Reduce a 68-point landmark array to the two anchor points used to cancel the roll.

    With ``use_lip`` the anchors are (eye midpoint, lip midpoint); otherwise they
    are (left eye, right eye). The result has shape 2x2.
    """
    # 1-based landmark numbers (nose, 2x left eye, 2x right eye, 2x lip) shifted to 0-based
    idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1

    left_eye = np.mean(pt68[idx[[1, 2]], :], 0)
    right_eye = np.mean(pt68[idx[[3, 4]], :], 0)
    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    # Stack the five reference points first so they share one dtype before averaging.
    five = np.stack([
        left_eye,
        right_eye,
        pt68[idx[0], :],  # nose
        pt68[idx[5], :],  # lip
        pt68[idx[6], :],  # lip
    ], axis=0)
    eye_mid = (five[0] + five[1]) / 2
    lip_mid = (five[3] + five[4]) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
def parse_rect_from_landmark(
    pts,
    scale=1.5,
    need_square=True,
    vx_ratio=0,
    vy_ratio=0,
    use_deg_flag=False,
    **kwargs
):
    """parsing center, size, angle from 101/68/5/x landmarks

    pts: Nx2 landmarks; the layout is judged from ``pts.shape``
    scale: factor applied to the box size
    need_square: if true, both sides of the box are set to the longer one
    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area
    use_deg_flag: if true, the angle is returned in degrees instead of radians
    kwargs: optional ``use_lip`` (default True), forwarded to ``parse_pt2_from_pt_x``

    return: (center, size, angle)
    """
    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))

    uy = pt2[1] - pt2[0]
    norm = np.linalg.norm(uy)
    if norm <= 1e-3:
        # degenerate anchors: fall back to an upright y-axis
        uy = np.array([0, 1], dtype=DTYPE)
    else:
        # out-of-place division: the in-place `/=` raised on integer-typed points
        uy = uy / norm
    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)

    # the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)
    # clamp first: rounding can push a unit-vector component marginally outside acos' domain
    angle = acos(float(np.clip(ux[0], -1.0, 1.0)))
    if ux[1] < 0:
        angle = -angle

    # rotation matrix
    M = np.array([ux, uy])

    # calculate the size which contains the angle degree of the bbox, and the center
    center0 = np.mean(pts, axis=0)
    rpts = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T
    lt_pt = np.min(rpts, axis=0)
    rb_pt = np.max(rpts, axis=0)
    center1 = (lt_pt + rb_pt) / 2

    size = rb_pt - lt_pt
    if need_square:
        m = max(size[0], size[1])
        size[0] = m
        size[1] = m

    size *= scale  # scale size
    center = center0 + ux * center1[0] + uy * center1[1]  # counterclockwise rotation, equivalent to M.T @ center1.T
    center = center + ux * (vx_ratio * size) + uy * \
        (vy_ratio * size)  # considering the offset in vx and vy direction

    if use_deg_flag:
        angle = degrees(angle)

    return center, size, angle
M_o2c = np.array( + [[s, 0, tgt_center[0] - s * src_center[0]], + [0, s, tgt_center[1] - s * src_center[1]]], + dtype=DTYPE + ) + + if flag_rot and angle is None: + print('angle is None, but flag_rotate is True', style="bold yellow") + + img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None)) + + lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None + + M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)]) + M_c2o = np.linalg.inv(M_o2c) + + # cv2.imwrite('crop.jpg', img_crop) + + return { + 'img_crop': img_crop, + 'lmk_crop': lmk_crop, + 'M_o2c': M_o2c, + 'M_c2o': M_c2o, + } + + +def _estimate_similar_transform_from_pts( + pts, + dsize, + scale=1.5, + vx_ratio=0, + vy_ratio=-0.1, + flag_do_rot=True, + **kwargs +): + """ calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image + pts: landmark, 101 or 68 points or other points, Nx2 + scale: the larger scale factor, the smaller face ratio + vx_ratio: x shift + vy_ratio: y shift, the smaller the y shift, the lower the face region + rot_flag: if it is true, conduct correction + """ + center, size, angle = parse_rect_from_landmark( + pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio, + use_lip=kwargs.get('use_lip', True) + ) + + s = dsize / size[0] # scale + tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE) # center of dsize + + if flag_do_rot: + costheta, sintheta = cos(angle), sin(angle) + cx, cy = center[0], center[1] # ori center + tcx, tcy = tgt_center[0], tgt_center[1] # target center + # need to infer + M_INV = np.array( + [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)], + [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]], + dtype=DTYPE + ) + else: + M_INV = np.array( + [[s, 0, tgt_center[0] - s * center[0]], + [0, s, tgt_center[1] - s * center[1]]], + dtype=DTYPE + ) + + M_INV_H = 
def crop_image(img, pts: np.ndarray, **kwargs):
    """ crop and align an image around the given landmarks

    img: image to crop, or None to only compute the transforms
    pts: landmarks, Nx2
    kwargs: ``dsize`` (default 224), ``scale`` (default 1.5), ``vy_ratio``
        (default -0.1) and ``flag_do_rot`` (default True)

    return: dict with
        'M_o2c': 3x3, from the original image to the cropped image
        'M_c2o': 3x3, from the cropped image to the original image
        'img_crop': the cropped image (None when img is None)
        'pt_crop': the landmarks in the cropped image (None when img is None)
        'M': only when img is None; legacy 2x3 matrix from the cropped image to the original image
    """
    dsize = kwargs.get('dsize', 224)
    scale = kwargs.get('scale', 1.5)  # 1.5 | 1.6
    vy_ratio = kwargs.get('vy_ratio', -0.1)  # -0.0625 | -0.1

    M_INV, _ = _estimate_similar_transform_from_pts(
        pts,
        dsize=dsize,
        scale=scale,
        vy_ratio=vy_ratio,
        flag_do_rot=kwargs.get('flag_do_rot', True),
    )

    # M_INV maps original -> crop; build both homogeneous directions once for every branch
    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
    M_c2o = np.linalg.inv(M_o2c)

    if img is None:
        # This branch used to store the crop -> original matrix under 'M_o2c' as well;
        # the keys now mean the same thing as in the image branch below.
        return {
            'M': M_c2o[:2, ...],  # legacy key, value unchanged: cropped image -> original image, 2x3
            'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
            'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
            'img_crop': None,
            'pt_crop': None,
        }

    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop
    pt_crop = _transform_pts(pts, M_INV)

    return {
        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
        'img_crop': img_crop,  # the cropped image
        'pt_crop': pt_crop,  # the landmarks of the cropped image
    }
@dataclass
class Trajectory:
    """Per-clip tracking state: the frame range plus per-frame landmarks, boxes and images."""
    start: int = -1  # start frame index, inclusive (closed interval); -1 until a start frame is assigned
    end: int = -1  # end frame index, inclusive (closed interval)
    lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list
    bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # bbox list
    frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame list
    frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame crop list
def crop_single_image(self, obj, **kwargs):
    """Detect a face in one image, crop and align it, and attach landmark results.

    Args:
        obj: image file path (``str``, loaded with ``load_image_rgb``) or an
            image array (``np.ndarray``).
        **kwargs: ``direction`` (face ordering rule, default ``'large-small'``),
            ``dsize`` (default 512), ``scale`` (default 2.3) and ``vy_ratio``
            (default -0.15), the last three forwarded to ``crop_image``.

    Returns:
        The dict produced by ``crop_image`` extended with ``'img_crop_256x256'``,
        ``'pt_crop_256x256'`` and ``'lmk_crop'`` (the ``'pts'`` entry returned by
        the landmark runner).

    Raises:
        TypeError: if ``obj`` is neither a path nor an array.
        gr.Error: if no face is detected.
    """
    direction = kwargs.get('direction', 'large-small')
    dsize = kwargs.get('dsize', 512)

    # crop and align a single image
    if isinstance(obj, str):
        img_rgb = load_image_rgb(obj)
    elif isinstance(obj, np.ndarray):
        img_rgb = obj
    else:
        # previously fell through and failed later on the unbound `img_rgb`
        raise TypeError(f'obj must be a file path or np.ndarray, got {type(obj).__name__}')

    src_face = self.face_analysis_wrapper.get(
        img_rgb,
        flag_do_landmark_2d_106=True,
        direction=direction
    )

    if len(src_face) == 0:
        log('No face detected in the source image.')
        # a second, unreachable `raise Exception(...)` followed this line and was dropped
        raise gr.Error("No face detected in the source image 💥!", duration=5)
    elif len(src_face) > 1:
        log(f'More than one face detected in the image, only pick one face by rule {direction}.')

    src_face = src_face[0]
    pts = src_face.landmark_2d_106

    # crop the face
    ret_dct = crop_image(
        img_rgb,  # ndarray
        pts,  # 106x2 or Nx2
        dsize=dsize,
        scale=kwargs.get('scale', 2.3),
        vy_ratio=kwargs.get('vy_ratio', -0.15),
    )
    # update a 256x256 version for network input or else
    ret_dct['img_crop_256x256'] = cv2.resize(ret_dct['img_crop'], (256, 256), interpolation=cv2.INTER_AREA)
    ret_dct['pt_crop_256x256'] = ret_dct['pt_crop'] * 256 / dsize

    recon_ret = self.landmark_runner.run(img_rgb, pts)
    ret_dct['lmk_crop'] = recon_ret['pts']

    return ret_dct
def make_video_clip(self, driving_rgb_lst, output_path, output_fps=30, **kwargs):
    """Track one face through the driving frames and crop them all with one shared box.

    Face detection runs on the first frame and on every later frame until a
    face has been found; after that each frame's landmarks are seeded with the
    previous frame's landmarks. The per-frame boxes are averaged into a single
    box that is used to crop every tracked frame.

    Args:
        driving_rgb_lst: iterable of frames.
        output_path, output_fps: accepted but not used by this code yet; writing
            the clip to disk is not implemented here.
        **kwargs: ``direction`` (face ordering rule, default ``'large-small'``).

    Returns:
        The ``Trajectory`` holding landmarks, boxes, frames and cropped frames.
        Frames seen before the first detection are not part of it.
    """
    trajectory = Trajectory()
    direction = kwargs.get('direction', 'large-small')
    for idx, driving_image in enumerate(driving_rgb_lst):
        if idx == 0 or trajectory.start == -1:
            src_face = self.face_analysis_wrapper.get(
                driving_image,
                flag_do_landmark_2d_106=True,
                direction=direction
            )
            if len(src_face) == 0:
                # No face detected in the driving_image
                continue
            elif len(src_face) > 1:
                log(f'More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.')
            src_face = src_face[0]
            pts = src_face.landmark_2d_106
            # NOTE(review): uses `.run(...)` as crop_single_image does; the original called the
            # runner object directly - confirm LandmarkRunner does not rely on `__call__`.
            lmk_203 = self.landmark_runner.run(driving_image, pts)['pts']
            trajectory.start, trajectory.end = idx, idx
        else:
            # `self.face_recon_wrapper` is never created in __init__; the landmark runner
            # is the only landmark model this class owns.
            lmk_203 = self.landmark_runner.run(driving_image, trajectory.lmk_lst[-1])['pts']
            trajectory.end = idx

        trajectory.lmk_lst.append(lmk_203)
        # NOTE(review): `globalscale` must exist on the crop config - confirm against CropConfig.
        ret_bbox = parse_bbox_from_landmark(
            lmk_203,
            scale=self.crop_cfg.globalscale,
            vy_ratio=self.crop_cfg.vy_ratio,  # was `elf.crop_cfg`, a NameError
        )['bbox']
        bbox = [ret_bbox[0, 0], ret_bbox[0, 1], ret_bbox[2, 0], ret_bbox[2, 1]]  # 4,
        trajectory.bbox_lst.append(bbox)  # bbox
        trajectory.frame_rgb_lst.append(driving_image)

    # `video_crop_cfg` is not set in __init__; fall back to the general crop config.
    # TODO confirm that config exposes dsize / flag_rot / borderValue.
    video_cfg = getattr(self, 'video_crop_cfg', self.crop_cfg)

    global_bbox = average_bbox_lst(trajectory.bbox_lst)
    for frame_rgb, lmk in zip(trajectory.frame_rgb_lst, trajectory.lmk_lst):
        ret_dct = crop_image_by_bbox(
            frame_rgb, global_bbox, lmk=lmk,
            dsize=video_cfg.dsize, flag_rot=video_cfg.flag_rot, borderValue=video_cfg.borderValue
        )
        # keep the crops instead of discarding them
        trajectory.frame_rgb_crop_lst.append(ret_dct['img_crop'])

    return trajectory
class Face(dict):
    """Dict whose entries are also readable and writable as attributes.

    Missing attributes read as ``None``. Plain dicts assigned as values are
    wrapped in ``Face``; lists and tuples become lists whose dict items are wrapped.
    """

    def __init__(self, d=None, **kwargs):
        fields = {} if d is None else d
        if kwargs:
            fields.update(**kwargs)  # note: updates the dict that was passed in
        for key, val in fields.items():
            setattr(self, key, val)

    def __setattr__(self, name, value):
        cls = self.__class__
        if isinstance(value, (list, tuple)):
            value = [cls(item) if isinstance(item, dict) else item for item in value]
        elif isinstance(value, dict) and not isinstance(value, cls):
            value = cls(value)
        # keep the attribute view and the mapping view in sync
        super().__setattr__(name, value)
        super().__setitem__(name, value)

    __setitem__ = __setattr__

    def __getattr__(self, name):
        # only reached when normal lookup fails: unknown fields read as None
        return None

    @property
    def embedding_norm(self):
        emb = self.embedding
        return None if emb is None else l2norm(emb)

    @property
    def normed_embedding(self):
        emb = self.embedding
        if emb is None:
            return None
        return emb / self.embedding_norm

    @property
    def sex(self):
        gender = self.gender
        if gender is None:
            return None
        return 'M' if gender == 1 else 'F'
def draw_on(self, img, faces):
    """Return a copy of ``img`` with each face's box, keypoints and sex/age label drawn.

    Args:
        img: image array; it is copied, never modified.
        faces: sequence of face objects exposing ``bbox``, ``kps``, ``gender``,
            ``age`` and ``sex``.

    Returns:
        The annotated copy of ``img``.
    """
    dimg = img.copy()
    if len(faces) == 0:
        return dimg

    import cv2  # local import, as in the original; only needed when something is drawn

    for face in faces:
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement
        box = face.bbox.astype(int)
        cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 2)
        if face.kps is not None:
            kps = face.kps.astype(int)
            for k in range(kps.shape[0]):
                # keypoints 0 and 3 get a different colour from the others
                color = (0, 255, 0) if k in (0, 3) else (0, 0, 255)
                cv2.circle(dimg, (kps[k][0], kps[k][1]), 1, color, 2)
        if face.gender is not None and face.age is not None:
            cv2.putText(dimg, '%s,%d' % (face.sex, face.age), (box[0]-1, box[1]-4), cv2.FONT_HERSHEY_COMPLEX, 0.7, (0, 255, 0), 1)

    return dimg
def get_image(name, to_rgb=False):
    """Load a bundled image by base name from the ``images`` directory next to this module.

    The extensions ``.jpg``, ``.png`` and ``.jpeg`` are tried in that order and
    the first existing file is read with ``cv2.imread``. Results are cached in
    ``ImageCache.data`` per ``(name, to_rgb)``.

    Args:
        name: file name without extension.
        to_rgb: when True the channel axis of the loaded image is reversed.
    """
    cache_key = (name, to_rgb)
    if cache_key in ImageCache.data:
        return ImageCache.data[cache_key]

    images_dir = osp.join(Path(__file__).parent.absolute(), 'images')
    candidates = (osp.join(images_dir, "%s%s"%(name, ext)) for ext in ['.jpg', '.png', '.jpeg'])
    # first candidate that exists on disk, or None
    image_file = next((path for path in candidates if osp.exists(path)), None)
    assert image_file is not None, '%s not found'%name

    img = cv2.imread(image_file)
    if to_rgb:
        img = img[:,:,::-1]
    ImageCache.data[cache_key] = img
    return img
/dev/null and b/src/utils/dependencies/insightface/data/images/mask_black.jpg differ diff --git a/src/utils/dependencies/insightface/data/images/mask_blue.jpg b/src/utils/dependencies/insightface/data/images/mask_blue.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f71336b9a0d3038ebd84e6995ebfbe54946fcbb4 Binary files /dev/null and b/src/utils/dependencies/insightface/data/images/mask_blue.jpg differ diff --git a/src/utils/dependencies/insightface/data/images/mask_green.jpg b/src/utils/dependencies/insightface/data/images/mask_green.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ac2ad55f4fc580c915dfa4c157ca3bfc84e453f4 Binary files /dev/null and b/src/utils/dependencies/insightface/data/images/mask_green.jpg differ diff --git a/src/utils/dependencies/insightface/data/images/mask_white.jpg b/src/utils/dependencies/insightface/data/images/mask_white.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2148ab2d09fdee6e3f59315470e98ecfc54339e4 Binary files /dev/null and b/src/utils/dependencies/insightface/data/images/mask_white.jpg differ diff --git a/src/utils/dependencies/insightface/data/images/t1.jpg b/src/utils/dependencies/insightface/data/images/t1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0d1d64a59675c9590fd12429db647eb169cecff8 Binary files /dev/null and b/src/utils/dependencies/insightface/data/images/t1.jpg differ diff --git a/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl b/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d5297e9e8ea5574298ddd287b058252e03aa18c1 --- /dev/null +++ b/src/utils/dependencies/insightface/data/objects/meanshape_68.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ffecf84ba73f0d0d7e49380833ba88713c9fcdec51df4f7ac45a48b8f4cc51 +size 974 diff --git 
class RecBuilder():
    """Incrementally write an MXNet indexed RecordIO dataset.

    Records go to ``train.idx`` / ``train.rec`` inside ``path``; ``close()``
    additionally writes ``train.meta`` (pickled per-record metadata) and a
    text ``property`` file.
    """

    def __init__(self, path, image_size=(112, 112)):
        """Create the output directory and open the record writer.

        Args:
            path: Output directory. It must not exist yet.
            image_size: Pair of ints written to the ``property`` file.

        Raises:
            AssertionError: If ``path`` already exists.
        """
        self.path = path
        self.image_size = image_size
        self.widx = 0        # index given to the next record
        self.wlabel = 0      # label given to the next group passed to add()
        self.max_label = -1
        assert not osp.exists(path), '%s exists' % path
        os.makedirs(path)
        self.writer = mx.recordio.MXIndexedRecordIO(os.path.join(path, 'train.idx'),
                                                    os.path.join(path, 'train.rec'),
                                                    'w')
        self.meta = []

    def _write_record(self, img, header_label, class_label):
        """Pack one image, append it to the record file and log its metadata."""
        idx = self.widx
        header = mx.recordio.IRHeader(0, header_label, idx, 0)
        if isinstance(img, np.ndarray):
            # Arrays are JPEG-encoded; the original author's note requires
            # them to be in BGR channel order.
            s = mx.recordio.pack_img(header, img, quality=95, img_fmt='.jpg')
        else:
            # Anything else is packed as-is.
            s = mx.recordio.pack(header, img)
        self.writer.write_idx(idx, s)
        self.meta.append({'image_index': idx, 'image_classes': [class_label]})
        self.widx += 1

    def add(self, imgs):
        """Append a group of images that all share one new label.

        The label is the running counter ``self.wlabel``, which is
        incremented once per call.

        Args:
            imgs: Non-empty sequence of images (BGR ndarrays) or
                already-encoded payloads.

        Raises:
            AssertionError: If ``imgs`` is empty.
        """
        assert len(imgs) > 0
        label = self.wlabel
        for img in imgs:
            self._write_record(img, label, label)
        self.max_label = label
        self.wlabel += 1

    def add_image(self, img, label):
        """Append a single image with an explicit label.

        Args:
            img: BGR ndarray or an already-encoded payload.
            label: Label stored in the record header. If it is a list, its
                first element is used as the class id in the metadata.
        """
        idlabel = label[0] if isinstance(label, list) else label
        self._write_record(img, label, idlabel)
        self.max_label = max(self.max_label, idlabel)

    def close(self):
        """Close the record writer and write the ``train.meta`` / ``property`` files."""
        # Close the writer explicitly so the .idx/.rec files are finalised
        # now instead of whenever the object happens to be garbage-collected.
        self.writer.close()
        with open(osp.join(self.path, 'train.meta'), 'wb') as pfile:
            pickle.dump(self.meta, pfile, protocol=pickle.HIGHEST_PROTOCOL)
        print('stat:', self.widx, self.wlabel)
        with open(os.path.join(self.path, 'property'), 'w') as f:
            f.write("%d,%d,%d\n" % (self.max_label+1, self.image_size[0], self.image_size[1]))
            f.write("%d\n" % (self.widx))
class ArcFaceONNX:
    """Wrapper around an ArcFace recognition model executed with onnxruntime."""

    def __init__(self, model_file=None, session=None):
        """Inspect the ONNX graph and bind (or create) an inference session.

        Args:
            model_file: Path of the ONNX model; required.
            session: Existing inference session to reuse. When None, a new
                ``onnxruntime.InferenceSession`` is created from model_file.

        Raises:
            AssertionError: If model_file is None or the model does not have
                exactly one output.
        """
        assert model_file is not None
        self.model_file = model_file
        self.session = session
        self.taskname = 'recognition'

        # Only the first eight graph nodes are examined. If both a
        # subtract-like and a multiply-like node are found, the input is fed
        # with mean 0 / std 1 (the original comment calls this the "mxnet
        # arcface model" case); otherwise 127.5 / 127.5 is applied here.
        leading_nodes = onnx.load(self.model_file).graph.node[:8]
        has_sub = any(n.name.startswith(('Sub', '_minus')) for n in leading_nodes)
        has_mul = any(n.name.startswith(('Mul', '_mul')) for n in leading_nodes)
        if has_sub and has_mul:
            self.input_mean, self.input_std = 0.0, 1.0
        else:
            self.input_mean, self.input_std = 127.5, 127.5

        if self.session is None:
            self.session = onnxruntime.InferenceSession(self.model_file, None)

        first_input = self.session.get_inputs()[0]
        declared_shape = first_input.shape
        # Axes 2 and 3 of the declared input shape, in reversed order.
        self.input_size = tuple(declared_shape[2:4][::-1])
        self.input_shape = declared_shape
        model_outputs = self.session.get_outputs()
        self.input_name = first_input.name
        self.output_names = [o.name for o in model_outputs]
        assert len(self.output_names)==1
        self.output_shape = model_outputs[0].shape

    def prepare(self, ctx_id, **kwargs):
        """Restrict the session to the CPU provider when ctx_id is negative."""
        if not ctx_id < 0:
            return
        self.session.set_providers(['CPUExecutionProvider'])

    def get(self, img, face):
        """Align the face crop, compute its embedding and store it on ``face``.

        Returns:
            The flattened embedding, also assigned to ``face.embedding``.
        """
        aligned = face_align.norm_crop(img, landmark=face.kps, image_size=self.input_size[0])
        face.embedding = self.get_feat(aligned).flatten()
        return face.embedding

    def compute_sim(self, feat1, feat2):
        """Return the cosine similarity of two feature arrays (flattened first)."""
        vec_a = feat1.ravel()
        vec_b = feat2.ravel()
        denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        return np.dot(vec_a, vec_b) / denominator

    def get_feat(self, imgs):
        """Run the model on one image or a list of images and return the raw output."""
        batch = imgs if isinstance(imgs, list) else [imgs]
        mean = self.input_mean
        blob = cv2.dnn.blobFromImages(batch, 1.0 / self.input_std, self.input_size,
                                      (mean, mean, mean), swapRB=True)
        return self.session.run(self.output_names, {self.input_name: blob})[0]

    def forward(self, batch_data):
        """Normalise an already-batched input with mean/std and run the model."""
        normalised = (batch_data - self.input_mean) / self.input_std
        return self.session.run(self.output_names, {self.input_name: normalised})[0]
def prepare(self, ctx_id, **kwargs):
    """Choose the execution provider for this model's session.

    A negative ``ctx_id`` restricts the session to ``CPUExecutionProvider``;
    any other value leaves the session untouched. Extra keyword arguments
    are accepted and ignored.
    """
    if not ctx_id < 0:
        return
    cpu_only = ['CPUExecutionProvider']
    self.session.set_providers(cpu_only)
class INSwapper():
    """Wrapper around the inswapper face-swapping ONNX model."""

    def __init__(self, model_file=None, session=None):
        """Load model metadata and bind (or create) an inference session.

        Args:
            model_file: Path of the ONNX model.
            session: Existing inference session to reuse. When None, a new
                ``onnxruntime.InferenceSession`` is created from model_file.

        Raises:
            AssertionError: If the model does not have exactly one output.
        """
        self.model_file = model_file
        self.session = session
        model = onnx.load(self.model_file)
        graph = model.graph
        # The last initializer of the graph is the matrix that get() uses to
        # project a recognition embedding into the swapper's latent input.
        self.emap = numpy_helper.to_array(graph.initializer[-1])
        self.input_mean = 0.0
        self.input_std = 255.0
        if self.session is None:
            self.session = onnxruntime.InferenceSession(self.model_file, None)
        inputs = self.session.get_inputs()
        self.input_names = [inp.name for inp in inputs]
        outputs = self.session.get_outputs()
        self.output_names = [out.name for out in outputs]
        assert len(self.output_names)==1
        input_shape = inputs[0].shape
        self.input_shape = input_shape
        # Axes 2 and 3 of the first input's declared shape, reversed.
        self.input_size = tuple(input_shape[2:4][::-1])

    def forward(self, img, latent):
        """Normalise ``img`` with mean/std and run the model on (img, latent)."""
        img = (img - self.input_mean) / self.input_std
        pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0]
        return pred

    def get(self, img, target_face, source_face, paste_back=True):
        """Swap the identity of ``source_face`` onto ``target_face`` in ``img``.

        Args:
            img: Image containing the target face.
            target_face: Face object providing ``kps`` and ``landmark_2d_106``.
            source_face: Face object providing ``normed_embedding``.
            paste_back: When False, return only the swapped crop and the
                alignment matrix; when True, blend the crop back into img.

        Returns:
            ``(bgr_fake, M)`` if ``paste_back`` is False, otherwise the merged
            uint8 image with the same height and width as ``img``.
        """
        # Binary mask of the target face, filled from a fixed selection of
        # the 106-point landmarks.
        face_mask = np.zeros((img.shape[0], img.shape[1]), np.uint8)
        cv2.fillPoly(face_mask, np.array([target_face.landmark_2d_106[[1,9,10,11,12,13,14,15,16,2,3,4,5,6,7,8,0,24,23,22,21,20,19,18,32,31,30,29,28,27,26,25,17,101,105,104,103,51,49,48,43]].astype('int64')]), 1)
        aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0])
        blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size,
                                      (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
        latent = source_face.normed_embedding.reshape((1,-1))
        latent = np.dot(latent, self.emap)
        latent /= np.linalg.norm(latent)
        pred = self.session.run(self.output_names, {self.input_names[0]: blob, self.input_names[1]: latent})[0]
        img_fake = pred.transpose((0,2,3,1))[0]
        # Scale to 0..255 and reverse the channel axis (undoes swapRB above).
        bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1]
        if not paste_back:
            return bgr_fake, M

        target_img = img
        out_size = (target_img.shape[1], target_img.shape[0])
        # Per-pixel mean absolute difference between swapped and aligned
        # crop; a 2-pixel frame is zeroed.
        fake_diff = bgr_fake.astype(np.float32) - aimg.astype(np.float32)
        fake_diff = np.abs(fake_diff).mean(axis=2)
        fake_diff[:2,:] = 0
        fake_diff[-2:,:] = 0
        fake_diff[:,:2] = 0
        fake_diff[:,-2:] = 0
        # Warp crop, an all-white coverage map and the difference map back
        # into the coordinate frame of the full image.
        IM = cv2.invertAffineTransform(M)
        img_white = np.full((aimg.shape[0],aimg.shape[1]), 255, dtype=np.float32)
        bgr_fake = cv2.warpAffine(bgr_fake, IM, out_size, borderValue=0.0)
        img_white = cv2.warpAffine(img_white, IM, out_size, borderValue=0.0)
        fake_diff = cv2.warpAffine(fake_diff, IM, out_size, borderValue=0.0)
        img_white[img_white>20] = 255
        # Binarise the difference map: below the threshold -> 0, else 255.
        fthresh = 10
        fake_diff[fake_diff<fthresh] = 0
        fake_diff[fake_diff>=fthresh] = 255
        img_mask = img_white
        mask_h_inds, mask_w_inds = np.where(img_mask==255)
        mask_h = np.max(mask_h_inds) - np.min(mask_h_inds)
        mask_w = np.max(mask_w_inds) - np.min(mask_w_inds)
        mask_size = int(np.sqrt(mask_h*mask_w))
        # Shrink the coverage mask; the kernel grows with the face size.
        k = max(mask_size//10, 10)
        kernel = np.ones((k,k),np.uint8)
        img_mask = cv2.erode(img_mask,kernel,iterations = 1)
        kernel = np.ones((2,2),np.uint8)
        fake_diff = cv2.dilate(fake_diff,kernel,iterations = 1)

        # Always take the swapped pixels inside the (eroded) landmark polygon.
        face_mask = cv2.erode(face_mask,np.ones((11,11),np.uint8),iterations = 1)
        fake_diff[face_mask==1] = 255

        # Feather both masks so the blend has soft edges.
        k = max(mask_size//20, 5)
        kernel_size = (k, k)
        blur_size = tuple(2*i+1 for i in kernel_size)
        img_mask = cv2.GaussianBlur(img_mask, blur_size, 0)
        fake_diff = cv2.blur(fake_diff, (11,11), 0)
        img_mask /= 255
        fake_diff /= 255
        img_mask = img_mask*fake_diff
        img_mask = np.reshape(img_mask, [img_mask.shape[0],img_mask.shape[1],1])
        fake_merged = img_mask * bgr_fake + (1-img_mask) * target_img.astype(np.float32)
        fake_merged = fake_merged.astype(np.uint8)
        return fake_merged
def get(self, img, face):
    """Predict landmarks for one detected face and attach them to ``face``.

    The face's bounding box is cropped (with a 1.5x margin on its longer
    side) to the model's input size, the model is run, and the predicted
    points are mapped back into the coordinates of ``img``. The result is
    stored under ``face[self.taskname]``; when the model requires pose,
    ``face['pose']`` is filled as well.

    Returns:
        The landmark array in image coordinates.
    """
    box = face.bbox
    box_w = box[2] - box[0]
    box_h = box[3] - box[1]
    center = (box[2] + box[0]) / 2, (box[3] + box[1]) / 2
    side = self.input_size[0]
    _scale = side / (max(box_w, box_h)*1.5)
    aimg, M = face_align.transform(img, center, side, _scale, 0)

    crop_size = tuple(aimg.shape[0:2][::-1])
    mean = self.input_mean
    blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, crop_size, (mean, mean, mean), swapRB=True)
    pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]

    # Long outputs hold 3 values per point, short ones 2.
    point_dim = 3 if pred.shape[0] >= 3000 else 2
    pred = pred.reshape((-1, point_dim))
    if self.lmk_num < pred.shape[0]:
        # Keep only the trailing lmk_num points.
        pred = pred[-self.lmk_num:, :]

    # Shift x/y by one and scale by half the input side; the third
    # coordinate, if present, is only scaled.
    half_side = self.input_size[0] // 2
    pred[:, 0:2] += 1
    pred[:, 0:2] *= half_side
    if pred.shape[1] == 3:
        pred[:, 2] *= half_side

    # Undo the crop transform so the points live in img coordinates.
    pred = face_align.trans_points(pred, cv2.invertAffineTransform(M))
    face[self.taskname] = pred
    if self.require_pose:
        P = transform.estimate_affine_matrix_3d23d(self.mean_lmk, pred)
        s, R, t = transform.P2sRt(P)
        rx, ry, rz = transform.matrix2angle(R)
        # Order per the original comment: pitch, yaw, roll.
        face['pose'] = np.array([rx, ry, rz], dtype=np.float32)
    return pred
def get_model_file(name, root=os.path.join('~', '.insightface', 'models')):
    r"""Return the local path of a pretrained model's ``.params`` file.

    If a ``.params`` file already exists under ``<root>/<name>`` and its SHA-1
    matches the registered checksum, it is returned directly. Otherwise the
    model archive is downloaded from the model zoo, extracted into that
    directory and verified. Missing directories are created.

    Parameters
    ----------
    name : str
        Name of the model; must be a key of the checksum table.
    root : str, default '~/.insightface/models'
        Location for keeping the model parameters.

    Returns
    -------
    file_path
        Path to the requested pretrained model file.

    Raises
    ------
    KeyError
        If ``name`` has no registered checksum.
    ValueError
        If the downloaded archive contains no ``.params`` file or the
        extracted file does not match the registered checksum.
    """

    file_name = name
    root = os.path.expanduser(root)
    dir_path = os.path.join(root, name)
    file_path = find_params_file(dir_path)
    sha1_hash = _model_sha1[name]
    if file_path is not None:
        if check_sha1(file_path, sha1_hash):
            return file_path
        print('Mismatch in the content of model file detected. Downloading again.')
    else:
        print('Model file is not found. Downloading.')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(root, exist_ok=True)
    os.makedirs(dir_path, exist_ok=True)

    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = base_repo_url
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    try:
        with zipfile.ZipFile(zip_file_path) as zf:
            zf.extractall(dir_path)
    finally:
        # Never leave the archive behind, even when extraction fails.
        os.remove(zip_file_path)
    file_path = find_params_file(dir_path)
    if file_path is None:
        # Without this guard the checksum helper would be handed None.
        raise ValueError(
            'Downloaded archive for {name} contains no .params file.'.format(name=name))

    if check_sha1(file_path, sha1_hash):
        return file_path
    raise ValueError(
        'Downloaded file has different hash. Please try again.')
def find_onnx_file(dir_path):
    """Return the last ``*.onnx`` file (in sorted order) inside ``dir_path``.

    Args:
        dir_path: Directory to search (not recursive).

    Returns:
        Path of the matching file that sorts last, or None when the
        directory does not exist or holds no ``.onnx`` file.
    """
    if not os.path.exists(dir_path):
        return None
    # Escape the directory part so characters such as '[', '*' or '?' in the
    # path are matched literally instead of being read as glob syntax.
    paths = glob.glob("%s/*.onnx" % glob.escape(str(dir_path)))
    if len(paths) == 0:
        return None
    paths = sorted(paths)
    return paths[-1]
def softmax(z):
    """Row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation, then is
    divided by the sum of its exponentials.

    Raises:
        AssertionError: If ``z`` is not 2-dimensional.
    """
    assert len(z.shape) == 2
    # keepdims keeps the reduced axis so the result broadcasts against z.
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    row_sum = np.sum(exps, axis=1, keepdims=True)
    return exps / row_sum
def distance2kps(points, distance, max_shape=None):
    """Decode keypoint offsets into absolute keypoint coordinates.

    Args:
        points (ndarray): Shape (n, 2), anchor points as [x, y].
        distance (ndarray): Shape (n, 2*k), interleaved (dx, dy) offsets of
            k keypoints relative to each anchor point.
        max_shape (tuple): Optional (height, width). When given, x is
            clipped to [0, width] and y to [0, height].

    Returns:
        ndarray: Shape (n, 2*k), interleaved absolute (x, y) coordinates.
    """
    preds = []
    for i in range(0, distance.shape[1], 2):
        px = points[:, 0] + distance[:, i]
        py = points[:, 1] + distance[:, i + 1]
        if max_shape is not None:
            # NumPy arrays have no .clamp(); np.clip is the equivalent.
            px = np.clip(px, 0, max_shape[1])
            py = np.clip(py, 0, max_shape[0])
        preds.append(px)
        preds.append(py)
    return np.stack(preds, axis=-1)
def prepare(self, ctx_id, **kwargs):
    """Configure the device and detection thresholds of the detector.

    Args:
        ctx_id: A negative value restricts the session to
            ``CPUExecutionProvider``; other values leave it untouched.
        **kwargs: Optional ``nms_thresh``, ``det_thresh`` and ``input_size``.
            Values that are None are ignored. ``input_size`` only takes
            effect when the detector has no input size yet; otherwise a
            warning is printed and the request is dropped.
    """
    if ctx_id < 0:
        self.session.set_providers(['CPUExecutionProvider'])

    for option in ('nms_thresh', 'det_thresh'):
        value = kwargs.get(option)
        if value is not None:
            setattr(self, option, value)

    requested_size = kwargs.get('input_size')
    if requested_size is None:
        return
    if self.input_size is None:
        self.input_size = requested_size
    else:
        print('warning: det_size is already set in detection model, ignore')
if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( 
(input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + +def 
get_retinaface(name, download=False, root='~/.insightface/models', **kwargs): + if not download: + assert os.path.exists(name) + return RetinaFace(name) + else: + from .model_store import get_model_file + _file = get_model_file("retinaface_%s" % name, root=root) + return retinaface(_file) + + diff --git a/src/utils/dependencies/insightface/model_zoo/scrfd.py b/src/utils/dependencies/insightface/model_zoo/scrfd.py new file mode 100644 index 0000000000000000000000000000000000000000..674db4bba761157592dfb95c5d1638da1099f89c --- /dev/null +++ b/src/utils/dependencies/insightface/model_zoo/scrfd.py @@ -0,0 +1,348 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +from __future__ import division +import datetime +import numpy as np +import onnx +import onnxruntime +import os +import os.path as osp +import cv2 +import sys + +def softmax(z): + assert len(z.shape) == 2 + s = np.max(z, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(z - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # dito + return e_x / div + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return np.stack([x1, y1, x2, y2], axis=-1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. 
+ + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return np.stack(preds, axis=-1) + +class SCRFD: + def __init__(self, model_file=None, session=None): + import onnxruntime + self.model_file = model_file + self.session = session + self.taskname = 'detection' + self.batched = False + if self.session is None: + assert self.model_file is not None + assert osp.exists(self.model_file) + self.session = onnxruntime.InferenceSession(self.model_file, None) + self.center_cache = {} + self.nms_thresh = 0.4 + self.det_thresh = 0.5 + self._init_vars() + + def _init_vars(self): + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + #print(input_shape) + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + self.input_shape = input_shape + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for o in outputs: + output_names.append(o.name) + self.input_name = input_name + self.output_names = output_names + self.input_mean = 127.5 + self.input_std = 128.0 + #print(self.output_names) + #assert len(outputs)==10 or len(outputs)==15 + self.use_kps = False + self._anchor_ratio = 1.0 + self._num_anchors = 1 + if len(outputs)==6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs)==9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + 
self.use_kps = True + elif len(outputs)==10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs)==15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + nms_thresh = kwargs.get('nms_thresh', None) + if nms_thresh is not None: + self.nms_thresh = nms_thresh + det_thresh = kwargs.get('det_thresh', None) + if det_thresh is not None: + self.det_thresh = det_thresh + input_size = kwargs.get('input_size', None) + if input_size is not None: + if self.input_size is not None: + print('warning: det_size is already set in scrfd model, ignore') + else: + self.input_size = input_size + + def forward(self, img, threshold): + scores_list = [] + bboxes_list = [] + kpss_list = [] + input_size = tuple(img.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + # If model support batch dim, take first output + if self.batched: + scores = net_outs[idx][0] + bbox_preds = net_outs[idx + fmc][0] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2][0] * stride + # If model doesn't support batching take output as is + else: + scores = net_outs[idx] + bbox_preds = net_outs[idx + fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2] * stride + + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = 
np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, 
bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + +def get_scrfd(name, download=False, root='~/.insightface/models', **kwargs): + if not download: + assert os.path.exists(name) + return 
SCRFD(name) + else: + from .model_store import get_model_file + _file = get_model_file("scrfd_%s" % name, root=root) + return SCRFD(_file) + + +def scrfd_2p5gkps(**kwargs): + return get_scrfd("2p5gkps", download=True, **kwargs) + + +if __name__ == '__main__': + import glob + detector = SCRFD(model_file='./det.onnx') + detector.prepare(-1) + img_paths = ['tests/data/t1.jpg'] + for img_path in img_paths: + img = cv2.imread(img_path) + + for _ in range(1): + ta = datetime.datetime.now() + #bboxes, kpss = detector.detect(img, 0.5, input_size = (640, 640)) + bboxes, kpss = detector.detect(img, 0.5) + tb = datetime.datetime.now() + print('all cost:', (tb-ta).total_seconds()*1000) + print(img_path, bboxes.shape) + if kpss is not None: + print(kpss.shape) + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + x1,y1,x2,y2,score = bbox.astype(np.int) + cv2.rectangle(img, (x1,y1) , (x2,y2) , (255,0,0) , 2) + if kpss is not None: + kps = kpss[i] + for kp in kps: + kp = kp.astype(np.int) + cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2) + filename = img_path.split('/')[-1] + print('output:', filename) + cv2.imwrite('./outputs/%s'%filename, img) + diff --git a/src/utils/dependencies/insightface/utils/__init__.py b/src/utils/dependencies/insightface/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6960431b1bd6db38890e391c4c94dd2182f2e1fd --- /dev/null +++ b/src/utils/dependencies/insightface/utils/__init__.py @@ -0,0 +1,6 @@ +from __future__ import absolute_import + +from .storage import download, ensure_available, download_onnx +from .filesystem import get_model_dir +from .filesystem import makedirs, try_import_dali +from .constant import * diff --git a/src/utils/dependencies/insightface/utils/constant.py b/src/utils/dependencies/insightface/utils/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..8860ff077ae7227235591edfc84c0cdc227a6432 --- /dev/null +++ b/src/utils/dependencies/insightface/utils/constant.py @@ 
-0,0 +1,3 @@ + +DEFAULT_MP_NAME = 'buffalo_l' + diff --git a/src/utils/dependencies/insightface/utils/download.py b/src/utils/dependencies/insightface/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..5cda84dede45b81dcd99161d87792b6c409fa279 --- /dev/null +++ b/src/utils/dependencies/insightface/utils/download.py @@ -0,0 +1,95 @@ +""" +This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py +""" +import os +import hashlib +import requests +from tqdm import tqdm + + +def check_sha1(filename, sha1_hash): + """Check whether the sha1 hash of the file content matches the expected hash. + Parameters + ---------- + filename : str + Path to the file. + sha1_hash : str + Expected sha1 hash in hexadecimal digits. + Returns + ------- + bool + Whether the file content matches the expected hash. + """ + sha1 = hashlib.sha1() + with open(filename, 'rb') as f: + while True: + data = f.read(1048576) + if not data: + break + sha1.update(data) + + sha1_file = sha1.hexdigest() + l = min(len(sha1_file), len(sha1_hash)) + return sha1.hexdigest()[0:l] == sha1_hash[0:l] + + +def download_file(url, path=None, overwrite=False, sha1_hash=None): + """Download an given URL + Parameters + ---------- + url : str + URL to download + path : str, optional + Destination path to store downloaded file. By default stores to the + current directory with same name as in url. + overwrite : bool, optional + Whether to overwrite destination file if already exists. + sha1_hash : str, optional + Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified + but doesn't match. + Returns + ------- + str + The file path of the downloaded file. 
+ """ + if path is None: + fname = url.split('/')[-1] + else: + path = os.path.expanduser(path) + if os.path.isdir(path): + fname = os.path.join(path, url.split('/')[-1]) + else: + fname = path + + if overwrite or not os.path.exists(fname) or ( + sha1_hash and not check_sha1(fname, sha1_hash)): + dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname))) + if not os.path.exists(dirname): + os.makedirs(dirname) + + print('Downloading %s from %s...' % (fname, url)) + r = requests.get(url, stream=True) + if r.status_code != 200: + raise RuntimeError("Failed downloading url %s" % url) + total_length = r.headers.get('content-length') + with open(fname, 'wb') as f: + if total_length is None: # no content length header + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + else: + total_length = int(total_length) + for chunk in tqdm(r.iter_content(chunk_size=1024), + total=int(total_length / 1024. + 0.5), + unit='KB', + unit_scale=False, + dynamic_ncols=True): + f.write(chunk) + + if sha1_hash and not check_sha1(fname, sha1_hash): + raise UserWarning('File {} is downloaded but the content hash does not match. ' \ + 'The repo may be outdated or download may be incomplete. 
' \ + 'If the "repo_url" is overridden, consider switching to ' \ + 'the default repo.'.format(fname)) + + return fname diff --git a/src/utils/dependencies/insightface/utils/face_align.py b/src/utils/dependencies/insightface/utils/face_align.py new file mode 100644 index 0000000000000000000000000000000000000000..226628b39cf743947df230feffbb97bf5c585e1d --- /dev/null +++ b/src/utils/dependencies/insightface/utils/face_align.py @@ -0,0 +1,103 @@ +import cv2 +import numpy as np +from skimage import transform as trans + + +arcface_dst = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + +def estimate_norm(lmk, image_size=112,mode='arcface'): + assert lmk.shape == (5, 2) + assert image_size%112==0 or image_size%128==0 + if image_size%112==0: + ratio = float(image_size)/112.0 + diff_x = 0 + else: + ratio = float(image_size)/128.0 + diff_x = 8.0*ratio + dst = arcface_dst * ratio + dst[:,0] += diff_x + tform = trans.SimilarityTransform() + tform.estimate(lmk, dst) + M = tform.params[0:2, :] + return M + +def norm_crop(img, landmark, image_size=112, mode='arcface'): + M = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped + +def norm_crop2(img, landmark, image_size=112, mode='arcface'): + M = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped, M + +def square_crop(im, S): + if im.shape[0] > im.shape[1]: + height = S + width = int(float(im.shape[1]) / im.shape[0] * S) + scale = float(S) / im.shape[0] + else: + width = S + height = int(float(im.shape[0]) / im.shape[1] * S) + scale = float(S) / im.shape[1] + resized_im = cv2.resize(im, (width, height)) + det_im = np.zeros((S, S, 3), dtype=np.uint8) + det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im + return det_im, scale + + +def transform(data, center, 
output_size, scale, rotation): + scale_ratio = scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = trans.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = trans.SimilarityTransform(rotation=rot) + t4 = trans.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + M = t.params[0:2] + cropped = cv2.warpAffine(data, + M, (output_size, output_size), + borderValue=0.0) + return cropped, M + + +def trans_points2d(pts, M): + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i] = new_pt[0:2] + + return new_pts + + +def trans_points3d(pts, M): + scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1]) + #print(scale) + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i][0:2] = new_pt[0:2] + new_pts[i][2] = pts[i][2] * scale + + return new_pts + + +def trans_points(pts, M): + if pts.shape[1] == 2: + return trans_points2d(pts, M) + else: + return trans_points3d(pts, M) + diff --git a/src/utils/dependencies/insightface/utils/filesystem.py b/src/utils/dependencies/insightface/utils/filesystem.py new file mode 100644 index 0000000000000000000000000000000000000000..01e3851975bdcbbf7f5eeb7e68e70a36dc040535 --- /dev/null +++ b/src/utils/dependencies/insightface/utils/filesystem.py @@ -0,0 +1,157 @@ +""" +This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py +""" +import os +import os.path as osp +import errno + 
+ +def get_model_dir(name, root='~/.insightface'): + root = os.path.expanduser(root) + model_dir = osp.join(root, 'models', name) + return model_dir + +def makedirs(path): + """Create directory recursively if not exists. + Similar to `makedir -p`, you can skip checking existence before this function. + + Parameters + ---------- + path : str + Path of the desired dir + """ + try: + os.makedirs(path) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + + +def try_import(package, message=None): + """Try import specified package, with custom message support. + + Parameters + ---------- + package : str + The name of the targeting package. + message : str, default is None + If not None, this function will raise customized error message when import error is found. + + + Returns + ------- + module if found, raise ImportError otherwise + + """ + try: + return __import__(package) + except ImportError as e: + if not message: + raise e + raise ImportError(message) + + +def try_import_cv2(): + """Try import cv2 at runtime. + + Returns + ------- + cv2 module if found. Raise ImportError otherwise + + """ + msg = "cv2 is required, you can install by package manager, e.g. 'apt-get', \ + or `pip install opencv-python --user` (note that this is unofficial PYPI package)." + + return try_import('cv2', msg) + + +def try_import_mmcv(): + """Try import mmcv at runtime. + + Returns + ------- + mmcv module if found. Raise ImportError otherwise + + """ + msg = "mmcv is required, you can install by first `pip install Cython --user` \ + and then `pip install mmcv --user` (note that this is unofficial PYPI package)." + + return try_import('mmcv', msg) + + +def try_import_rarfile(): + """Try import rarfile at runtime. + + Returns + ------- + rarfile module if found. Raise ImportError otherwise + + """ + msg = "rarfile is required, you can install by first `sudo apt-get install unrar` \ + and then `pip install rarfile --user` (note that this is unofficial PYPI package)." 
+ + return try_import('rarfile', msg) + + +def import_try_install(package, extern_url=None): + """Try import the specified package. + If the package not installed, try use pip to install and import if success. + + Parameters + ---------- + package : str + The name of the package trying to import. + extern_url : str or None, optional + The external url if package is not hosted on PyPI. + For example, you can install a package using: + "pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx". + In this case, you can pass the url to the extern_url. + + Returns + ------- + + The imported python module. + + """ + try: + return __import__(package) + except ImportError: + try: + from pip import main as pipmain + except ImportError: + from pip._internal import main as pipmain + + # trying to install package + url = package if extern_url is None else extern_url + pipmain(['install', '--user', + url]) # will raise SystemExit Error if fails + + # trying to load again + try: + return __import__(package) + except ImportError: + import sys + import site + user_site = site.getusersitepackages() + if user_site not in sys.path: + sys.path.append(user_site) + return __import__(package) + return __import__(package) + + +def try_import_dali(): + """Try import NVIDIA DALI at runtime. + """ + try: + dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types']) + dali.Pipeline = dali.pipeline.Pipeline + except ImportError: + + class dali: + class Pipeline: + def __init__(self): + raise NotImplementedError( + "DALI not found, please check if you installed it correctly." 
+ ) + + return dali diff --git a/src/utils/dependencies/insightface/utils/storage.py b/src/utils/dependencies/insightface/utils/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf37e2d17b28dee2a8839484778815f87fc4a9c --- /dev/null +++ b/src/utils/dependencies/insightface/utils/storage.py @@ -0,0 +1,52 @@ + +import os +import os.path as osp +import zipfile +from .download import download_file + +BASE_REPO_URL = 'https://github.com/deepinsight/insightface/releases/download/v0.7' + +def download(sub_dir, name, force=False, root='~/.insightface'): + _root = os.path.expanduser(root) + dir_path = os.path.join(_root, sub_dir, name) + if osp.exists(dir_path) and not force: + return dir_path + print('download_path:', dir_path) + zip_file_path = os.path.join(_root, sub_dir, name + '.zip') + model_url = "%s/%s.zip"%(BASE_REPO_URL, name) + download_file(model_url, + path=zip_file_path, + overwrite=True) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + with zipfile.ZipFile(zip_file_path) as zf: + zf.extractall(dir_path) + #os.remove(zip_file_path) + return dir_path + +def ensure_available(sub_dir, name, root='~/.insightface'): + return download(sub_dir, name, force=False, root=root) + +def download_onnx(sub_dir, model_file, force=False, root='~/.insightface', download_zip=False): + _root = os.path.expanduser(root) + model_root = osp.join(_root, sub_dir) + new_model_file = osp.join(model_root, model_file) + if osp.exists(new_model_file) and not force: + return new_model_file + if not osp.exists(model_root): + os.makedirs(model_root) + print('download_path:', new_model_file) + if not download_zip: + model_url = "%s/%s"%(BASE_REPO_URL, model_file) + download_file(model_url, + path=new_model_file, + overwrite=True) + else: + model_url = "%s/%s.zip"%(BASE_REPO_URL, model_file) + zip_file_path = new_model_file+".zip" + download_file(model_url, + path=zip_file_path, + overwrite=True) + with zipfile.ZipFile(zip_file_path) as zf: + 
zf.extractall(model_root) + return new_model_file diff --git a/src/utils/dependencies/insightface/utils/transform.py b/src/utils/dependencies/insightface/utils/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..06531d257b694211a0b9a09c9d741b9b2ff53bfe --- /dev/null +++ b/src/utils/dependencies/insightface/utils/transform.py @@ -0,0 +1,116 @@ +import cv2 +import math +import numpy as np +from skimage import transform as trans + + +def transform(data, center, output_size, scale, rotation): + scale_ratio = scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = trans.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = trans.SimilarityTransform(rotation=rot) + t4 = trans.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + M = t.params[0:2] + cropped = cv2.warpAffine(data, + M, (output_size, output_size), + borderValue=0.0) + return cropped, M + + +def trans_points2d(pts, M): + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i] = new_pt[0:2] + + return new_pts + + +def trans_points3d(pts, M): + scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1]) + #print(scale) + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i][0:2] = new_pt[0:2] + new_pts[i][2] = pts[i][2] * scale + + return new_pts + + +def trans_points(pts, M): + if pts.shape[1] == 2: + return trans_points2d(pts, M) + else: + return trans_points3d(pts, M) + 
+def estimate_affine_matrix_3d23d(X, Y): + ''' Using least-squares solution + Args: + X: [n, 3]. 3d points(fixed) + Y: [n, 3]. corresponding 3d points(moving). Y = PX + Returns: + P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]). + ''' + X_homo = np.hstack((X, np.ones([X.shape[0],1]))) #n x 4 + P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4 + return P + +def P2sRt(P): + ''' decompositing camera matrix P + Args: + P: (3, 4). Affine Camera Matrix. + Returns: + s: scale factor. + R: (3, 3). rotation matrix. + t: (3,). translation. + ''' + t = P[:, 3] + R1 = P[0:1, :3] + R2 = P[1:2, :3] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0 + r1 = R1/np.linalg.norm(R1) + r2 = R2/np.linalg.norm(R2) + r3 = np.cross(r1, r2) + + R = np.concatenate((r1, r2, r3), 0) + return s, R, t + +def matrix2angle(R): + ''' get three Euler angles from Rotation Matrix + Args: + R: (3,3). rotation matrix + Returns: + x: pitch + y: yaw + z: roll + ''' + sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0]) + + singular = sy < 1e-6 + + if not singular : + x = math.atan2(R[2,1] , R[2,2]) + y = math.atan2(-R[2,0], sy) + z = math.atan2(R[1,0], R[0,0]) + else : + x = math.atan2(-R[1,2], R[1,1]) + y = math.atan2(-R[2,0], sy) + z = 0 + + # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z) + rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi + return rx, ry, rz + diff --git a/src/utils/face_analysis_diy.py b/src/utils/face_analysis_diy.py new file mode 100644 index 0000000000000000000000000000000000000000..456be5e5802952fc0cbc01f1fbb13773b5e12aa1 --- /dev/null +++ b/src/utils/face_analysis_diy.py @@ -0,0 +1,79 @@ +# coding: utf-8 + +""" +face detectoin and alignment using InsightFace +""" + +import numpy as np +from .rprint import rlog as log +from .dependencies.insightface.app import FaceAnalysis +from .dependencies.insightface.app.common import Face +from .timer import Timer + + +def sort_by_direction(faces, direction: str = 'large-small', face_center=None): + 
if len(faces) <= 0: + return faces + + if direction == 'left-right': + return sorted(faces, key=lambda face: face['bbox'][0]) + if direction == 'right-left': + return sorted(faces, key=lambda face: face['bbox'][0], reverse=True) + if direction == 'top-bottom': + return sorted(faces, key=lambda face: face['bbox'][1]) + if direction == 'bottom-top': + return sorted(faces, key=lambda face: face['bbox'][1], reverse=True) + if direction == 'small-large': + return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1])) + if direction == 'large-small': + return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]), reverse=True) + if direction == 'distance-from-retarget-face': + return sorted(faces, key=lambda face: (((face['bbox'][2]+face['bbox'][0])/2-face_center[0])**2+((face['bbox'][3]+face['bbox'][1])/2-face_center[1])**2)**0.5) + return faces + + +class FaceAnalysisDIY(FaceAnalysis): + def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs): + super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs) + + self.timer = Timer() + + def get(self, img_bgr, **kwargs): + max_num = kwargs.get('max_num', 0) # the number of the detected faces, 0 means no limit + flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True) # whether to do 106-point detection + direction = kwargs.get('direction', 'large-small') # sorting direction + face_center = None + + bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default') + if bboxes.shape[0] == 0: + return [] + ret = [] + for i in range(bboxes.shape[0]): + bbox = bboxes[i, 0:4] + det_score = bboxes[i, 4] + kps = None + if kpss is not None: + kps = kpss[i] + face = Face(bbox=bbox, kps=kps, det_score=det_score) + for taskname, model in self.models.items(): + if taskname == 'detection': + continue + + if (not flag_do_landmark_2d_106) and taskname == 
'landmark_2d_106': + continue + + # print(f'taskname: {taskname}') + model.get(img_bgr, face) + ret.append(face) + + ret = sort_by_direction(ret, direction, face_center) + return ret + + def warmup(self): + self.timer.tic() + + img_bgr = np.zeros((512, 512, 3), dtype=np.uint8) + self.get(img_bgr) + + elapse = self.timer.toc() + log(f'FaceAnalysisDIY warmup time: {elapse:.3f}s') diff --git a/src/utils/helper.py b/src/utils/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..4974fc5a6398613aa7a2facccfaafdde0ff40dfb --- /dev/null +++ b/src/utils/helper.py @@ -0,0 +1,154 @@ +# coding: utf-8 + +""" +utility functions and classes to handle feature extraction and model loading +""" + +import os +import os.path as osp +import torch +from collections import OrderedDict + +from ..modules.spade_generator import SPADEDecoder +from ..modules.warping_network import WarpingNetwork +from ..modules.motion_extractor import MotionExtractor +from ..modules.appearance_feature_extractor import AppearanceFeatureExtractor +from ..modules.stitching_retargeting_network import StitchingRetargetingNetwork + + +def suffix(filename): + """a.jpg -> jpg""" + pos = filename.rfind(".") + if pos == -1: + return "" + return filename[pos + 1:] + + +def prefix(filename): + """a.jpg -> a""" + pos = filename.rfind(".") + if pos == -1: + return filename + return filename[:pos] + + +def basename(filename): + """a/b/c.jpg -> c""" + return prefix(osp.basename(filename)) + + +def is_video(file_path): + if file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm")) or osp.isdir(file_path): + return True + return False + + +def is_template(file_path): + if file_path.endswith(".pkl"): + return True + return False + + +def mkdir(d, log=False): + # return self-assined `d`, for one line code + if not osp.exists(d): + os.makedirs(d, exist_ok=True) + if log: + print(f"Make dir: {d}") + return d + + +def squeeze_tensor_to_numpy(tensor): + out = tensor.data.squeeze(0).cpu().numpy() + return 
out + + +def dct2cuda(dct: dict, device_id: int): + for key in dct: + dct[key] = torch.tensor(dct[key]).cuda(device_id) + return dct + + +def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor: + """ + kp_source: (bs, k, 3) + kp_driving: (bs, k, 3) + Return: (bs, 2k*3) + """ + bs_src = kp_source.shape[0] + bs_dri = kp_driving.shape[0] + assert bs_src == bs_dri, 'batch size must be equal' + + feat = torch.cat([kp_source.view(bs_src, -1), kp_driving.view(bs_dri, -1)], dim=1) + return feat + + +def remove_ddp_dumplicate_key(state_dict): + state_dict_new = OrderedDict() + for key in state_dict.keys(): + state_dict_new[key.replace('module.', '')] = state_dict[key] + return state_dict_new + + +def load_model(ckpt_path, model_config, device, model_type): + model_params = model_config['model_params'][f'{model_type}_params'] + + if model_type == 'appearance_feature_extractor': + model = AppearanceFeatureExtractor(**model_params).cuda(device) + elif model_type == 'motion_extractor': + model = MotionExtractor(**model_params).cuda(device) + elif model_type == 'warping_module': + model = WarpingNetwork(**model_params).cuda(device) + elif model_type == 'spade_generator': + model = SPADEDecoder(**model_params).cuda(device) + elif model_type == 'stitching_retargeting_module': + # Special handling for stitching and retargeting module + config = model_config['model_params']['stitching_retargeting_module_params'] + checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage) + + stitcher = StitchingRetargetingNetwork(**config.get('stitching')) + stitcher.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_shoulder'])) + stitcher = stitcher.cuda(device) + stitcher.eval() + + retargetor_lip = StitchingRetargetingNetwork(**config.get('lip')) + retargetor_lip.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_mouth'])) + retargetor_lip = retargetor_lip.cuda(device) + retargetor_lip.eval() + + retargetor_eye = 
StitchingRetargetingNetwork(**config.get('eye')) + retargetor_eye.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_eye'])) + retargetor_eye = retargetor_eye.cuda(device) + retargetor_eye.eval() + + return { + 'stitching': stitcher, + 'lip': retargetor_lip, + 'eye': retargetor_eye + } + else: + raise ValueError(f"Unknown model type: {model_type}") + + model.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage)) + model.eval() + return model + + +# get coefficients of Eqn. 7 +def calculate_transformation(config, s_kp_info, t_0_kp_info, t_i_kp_info, R_s, R_t_0, R_t_i): + if config.relative: + new_rotation = (R_t_i @ R_t_0.permute(0, 2, 1)) @ R_s + new_expression = s_kp_info['exp'] + (t_i_kp_info['exp'] - t_0_kp_info['exp']) + else: + new_rotation = R_t_i + new_expression = t_i_kp_info['exp'] + new_translation = s_kp_info['t'] + (t_i_kp_info['t'] - t_0_kp_info['t']) + new_translation[..., 2].fill_(0) # Keep the z-axis unchanged + new_scale = s_kp_info['scale'] * (t_i_kp_info['scale'] / t_0_kp_info['scale']) + return new_rotation, new_expression, new_translation, new_scale + + +def load_description(fp): + with open(fp, 'r', encoding='utf-8') as f: + content = f.read() + return content diff --git a/src/utils/io.py b/src/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..29a7e008759ec8428171a18ac65960fc14397fdc --- /dev/null +++ b/src/utils/io.py @@ -0,0 +1,97 @@ +# coding: utf-8 + +import os +from glob import glob +import os.path as osp +import imageio +import numpy as np +import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) + + +def load_image_rgb(image_path: str): + if not osp.exists(image_path): + raise FileNotFoundError(f"Image not found: {image_path}") + img = cv2.imread(image_path, cv2.IMREAD_COLOR) + return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + +def load_driving_info(driving_info): + driving_video_ori = [] + + def load_images_from_directory(directory): + image_paths = 
sorted(glob(osp.join(directory, '*.png')) + glob(osp.join(directory, '*.jpg'))) + return [load_image_rgb(im_path) for im_path in image_paths] + + def load_images_from_video(file_path): + reader = imageio.get_reader(file_path) + return [image for idx, image in enumerate(reader)] + + if osp.isdir(driving_info): + driving_video_ori = load_images_from_directory(driving_info) + elif osp.isfile(driving_info): + driving_video_ori = load_images_from_video(driving_info) + + return driving_video_ori + + +def contiguous(obj): + if not obj.flags.c_contiguous: + obj = obj.copy(order="C") + return obj + + +def resize_to_limit(img: np.ndarray, max_dim=1920, n=2): + """ + ajust the size of the image so that the maximum dimension does not exceed max_dim, and the width and the height of the image are multiples of n. + :param img: the image to be processed. + :param max_dim: the maximum dimension constraint. + :param n: the number that needs to be multiples of. + :return: the adjusted image. + """ + h, w = img.shape[:2] + + # ajust the size of the image according to the maximum dimension + if max_dim > 0 and max(h, w) > max_dim: + if h > w: + new_h = max_dim + new_w = int(w * (max_dim / h)) + else: + new_w = max_dim + new_h = int(h * (max_dim / w)) + img = cv2.resize(img, (new_w, new_h)) + + # ensure that the image dimensions are multiples of n + n = max(n, 1) + new_h = img.shape[0] - (img.shape[0] % n) + new_w = img.shape[1] - (img.shape[1] % n) + + if new_h == 0 or new_w == 0: + # when the width or height is less than n, no need to process + return img + + if new_h != img.shape[0] or new_w != img.shape[1]: + img = img[:new_h, :new_w] + + return img + + +def load_img_online(obj, mode="bgr", **kwargs): + max_dim = kwargs.get("max_dim", 1920) + n = kwargs.get("n", 2) + if isinstance(obj, str): + if mode.lower() == "gray": + img = cv2.imread(obj, cv2.IMREAD_GRAYSCALE) + else: + img = cv2.imread(obj, cv2.IMREAD_COLOR) + else: + img = obj + + # Resize image to satisfy constraints + img = 
resize_to_limit(img, max_dim=max_dim, n=n) + + if mode.lower() == "bgr": + return contiguous(img) + elif mode.lower() == "rgb": + return contiguous(img[..., ::-1]) + else: + raise Exception(f"Unknown mode {mode}") diff --git a/src/utils/landmark_runner.py b/src/utils/landmark_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..7b0dcbefd491d71aa0ae9d9128fb22660f45f7b0 --- /dev/null +++ b/src/utils/landmark_runner.py @@ -0,0 +1,89 @@ +# coding: utf-8 + +import os.path as osp +import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) +import torch +import numpy as np +import onnxruntime +from .timer import Timer +from .rprint import rlog +from .crop import crop_image, _transform_pts + + +def make_abs_path(fn): + return osp.join(osp.dirname(osp.realpath(__file__)), fn) + + +def to_ndarray(obj): + if isinstance(obj, torch.Tensor): + return obj.cpu().numpy() + elif isinstance(obj, np.ndarray): + return obj + else: + return np.array(obj) + + +class LandmarkRunner(object): + """landmark runner""" + def __init__(self, **kwargs): + ckpt_path = kwargs.get('ckpt_path') + onnx_provider = kwargs.get('onnx_provider', 'cuda') # ้ป˜่ฎค็”จcuda + device_id = kwargs.get('device_id', 0) + self.dsize = kwargs.get('dsize', 224) + self.timer = Timer() + + if onnx_provider.lower() == 'cuda': + self.session = onnxruntime.InferenceSession( + ckpt_path, providers=[ + ('CUDAExecutionProvider', {'device_id': device_id}) + ] + ) + else: + opts = onnxruntime.SessionOptions() + opts.intra_op_num_threads = 4 # ้ป˜่ฎค็บฟ็จ‹ๆ•ฐไธบ 4 + self.session = onnxruntime.InferenceSession( + ckpt_path, providers=['CPUExecutionProvider'], + sess_options=opts + ) + + def _run(self, inp): + out = self.session.run(None, {'input': inp}) + return out + + def run(self, img_rgb: np.ndarray, lmk=None): + if lmk is not None: + crop_dct = crop_image(img_rgb, lmk, dsize=self.dsize, scale=1.5, vy_ratio=-0.1) + img_crop_rgb = crop_dct['img_crop'] + else: + img_crop_rgb = cv2.resize(img_rgb, 
(self.dsize, self.dsize)) + scale = max(img_rgb.shape[:2]) / self.dsize + crop_dct = { + 'M_c2o': np.array([ + [scale, 0., 0.], + [0., scale, 0.], + [0., 0., 1.], + ], dtype=np.float32), + } + + inp = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)[None, ...] # HxWx3 (BGR) -> 1x3xHxW (RGB!) + + out_lst = self._run(inp) + out_pts = out_lst[2] + + pts = to_ndarray(out_pts[0]).reshape(-1, 2) * self.dsize # scale to 0-224 + pts = _transform_pts(pts, M=crop_dct['M_c2o']) + + return { + 'pts': pts, # 2d landmarks 203 points + } + + def warmup(self): + # ๆž„้€ dummy image่ฟ›่กŒwarmup + self.timer.tic() + + dummy_image = np.zeros((1, 3, self.dsize, self.dsize), dtype=np.float32) + + _ = self._run(dummy_image) + + elapse = self.timer.toc() + rlog(f'LandmarkRunner warmup time: {elapse:.3f}s') diff --git a/src/utils/resources/mask_template.png b/src/utils/resources/mask_template.png new file mode 100644 index 0000000000000000000000000000000000000000..bca6ca5977ba820d0d2c05b3793c6231cc82e715 Binary files /dev/null and b/src/utils/resources/mask_template.png differ diff --git a/src/utils/retargeting_utils.py b/src/utils/retargeting_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..20a1bdd37a05f8140124db2a1be568683e917653 --- /dev/null +++ b/src/utils/retargeting_utils.py @@ -0,0 +1,54 @@ + +""" +Functions to compute distance ratios between specific pairs of facial landmarks +""" + +import numpy as np + + +def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray: + """ + Calculate the ratio of the distance between two pairs of landmarks. + + Parameters: + lmk (np.ndarray): Landmarks array of shape (B, N, 2). + idx1, idx2, idx3, idx4 (int): Indices of the landmarks. + eps (float): Small value to avoid division by zero. + + Returns: + np.ndarray: Calculated distance ratio. 
+ """ + return (np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True) / + (np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps)) + + +def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray: + """ + Calculate the eye-close ratio for left and right eyes. + + Parameters: + lmk (np.ndarray): Landmarks array of shape (B, N, 2). + target_eye_ratio (np.ndarray, optional): Additional target eye ratio array to include. + + Returns: + np.ndarray: Concatenated eye-close ratios. + """ + lefteye_close_ratio = calculate_distance_ratio(lmk, 6, 18, 0, 12) + righteye_close_ratio = calculate_distance_ratio(lmk, 30, 42, 24, 36) + if target_eye_ratio is not None: + return np.concatenate([lefteye_close_ratio, righteye_close_ratio, target_eye_ratio], axis=1) + else: + return np.concatenate([lefteye_close_ratio, righteye_close_ratio], axis=1) + + +def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray: + """ + Calculate the lip-close ratio. + + Parameters: + lmk (np.ndarray): Landmarks array of shape (B, N, 2). + + Returns: + np.ndarray: Calculated lip-close ratio. 
+ """ + return calculate_distance_ratio(lmk, 90, 102, 48, 66) diff --git a/src/utils/rprint.py b/src/utils/rprint.py new file mode 100644 index 0000000000000000000000000000000000000000..c43a42f9855bbb019725e6c2b6c6c50e6fa4d0c5 --- /dev/null +++ b/src/utils/rprint.py @@ -0,0 +1,16 @@ +# coding: utf-8 + +""" +custom print and log functions +""" + +__all__ = ['rprint', 'rlog'] + +try: + from rich.console import Console + console = Console() + rprint = console.print + rlog = console.log +except: + rprint = print + rlog = print diff --git a/src/utils/timer.py b/src/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..3570fa45d3ff36376471b82a5b3c02efe46eed98 --- /dev/null +++ b/src/utils/timer.py @@ -0,0 +1,29 @@ +# coding: utf-8 + +""" +tools to measure elapsed time +""" + +import time + +class Timer(object): + """A simple timer.""" + + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + return self.diff + + def clear(self): + self.start_time = 0. + self.diff = 0. 
# ===== src/utils/video.py (new file, mode 100644) =====
# coding: utf-8

"""
functions for processing video
"""

import os.path as osp
import numpy as np
import subprocess
import imageio
import cv2
from fractions import Fraction

from rich.progress import track
from .helper import prefix
from .rprint import rprint as print


def exec_cmd(cmd):
    """
    Run an external command and raise subprocess.CalledProcessError on a non-zero exit code.

    `cmd` is preferably a list of arguments, which is executed without a shell so that
    paths containing spaces or shell metacharacters are passed through verbatim.
    A plain string is still accepted for backward compatibility and is run through the
    shell - never build such a string from untrusted input.
    """
    subprocess.run(cmd, shell=isinstance(cmd, str), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)


def images2video(images, wfp, **kwargs):
    """
    Write a sequence of frames to the video file `wfp`.

    Keyword arguments: fps (30), format ('mp4'), codec ('libx264'), quality (None),
    pixelformat ('yuv420p'), image_mode ('rgb'; 'bgr' frames get their channels reversed),
    macro_block_size (2) and crf (18).
    """
    fps = kwargs.get('fps', 30)
    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
    quality = kwargs.get('quality')  # video quality
    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
    image_mode = kwargs.get('image_mode', 'rgb')
    macro_block_size = kwargs.get('macro_block_size', 2)
    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]

    writer = imageio.get_writer(
        wfp, fps=fps, format=video_format,
        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
    )

    is_bgr = image_mode.lower() == 'bgr'
    try:
        for i in track(range(len(images)), description='writing', transient=True):
            frame = images[i]
            writer.append_data(frame[..., ::-1] if is_bgr else frame)
    finally:
        # always finalise the file, even when a frame fails to encode
        writer.close()

    print(f'Dump to {wfp}\n')


def video2gif(video_fp, fps=30, size=256):
    """
    Convert `video_fp` to `<name>.gif` in the same directory using a two-pass ffmpeg palette.

    The intermediate palette is written to `palette.png` next to the video.
    Only prints a message when the video does not exist.
    """
    if osp.exists(video_fp):
        d = osp.split(video_fp)[0]
        fn = prefix(osp.basename(video_fp))
        palette_wfp = osp.join(d, 'palette.png')
        gif_wfp = osp.join(d, f'{fn}.gif')
        # generate the palette
        exec_cmd([
            'ffmpeg', '-i', video_fp,
            '-vf', f'fps={fps},scale={size}:-1:flags=lanczos,palettegen',
            palette_wfp, '-y',
        ])
        # use the palette to generate the gif
        exec_cmd([
            'ffmpeg', '-i', video_fp, '-i', palette_wfp,
            '-filter_complex', f'fps={fps},scale={size}:-1:flags=lanczos[x];[x][1:v]paletteuse',
            gif_wfp, '-y',
        ])
    else:
        print(f'video_fp: {video_fp} not exists!')


def merge_audio_video(video_fp, audio_fp, wfp):
    """Mux `video_fp` (video stream copied) with `audio_fp` (re-encoded to AAC) into `wfp`."""
    if osp.exists(video_fp) and osp.exists(audio_fp):
        exec_cmd(['ffmpeg', '-i', video_fp, '-i', audio_fp, '-c:v', 'copy', '-c:a', 'aac', wfp, '-y'])
        print(f'merge {video_fp} and {audio_fp} to {wfp}')
    else:
        print(f'video_fp: {video_fp} or audio_fp: {audio_fp} not exists!')


def blend(img: np.ndarray, mask: np.ndarray, background_color=(255, 255, 255)):
    """Alpha-blend `img` over a solid background using `mask` (0-255) as the opacity; returns uint8."""
    mask_float = mask.astype(np.float32) / 255.
    background_color = np.array(background_color).reshape([1, 1, 3])
    bg = np.ones_like(img) * background_color
    img = np.clip(mask_float * img + (1 - mask_float) * bg, 0, 255).astype(np.uint8)
    return img


def concat_frames(I_p_lst, driving_rgb_lst, img_rgb):
    """
    Build side-by-side frames: driving frame | source image | generated frame.

    The driving frame and the source image are resized to the size of the generated frame.
    `driving_rgb_lst` must have at least as many frames as `I_p_lst`.
    """
    # TODO: add more concat style, e.g., left-down corner driving
    out_lst = []
    for idx, source_image_drived in track(enumerate(I_p_lst), total=len(I_p_lst), description='Concatenating result...'):
        image_drive = driving_rgb_lst[idx]

        # resize images to match source_image_drived shape
        h, w, _ = source_image_drived.shape
        image_drive_resized = cv2.resize(image_drive, (w, h))
        img_rgb_resized = cv2.resize(img_rgb, (w, h))

        # concatenate images horizontally
        frame = np.concatenate((image_drive_resized, img_rgb_resized, source_image_drived), axis=1)
        out_lst.append(frame)
    return out_lst


class VideoWriter:
    """
    Incremental video writer on top of imageio.

    Keyword arguments: fps (30), wfp ('video.mp4'), format ('mp4'), codec ('libx264'),
    quality (None), pixelformat ('yuv420p'), image_mode ('rgb') and ffmpeg_params (None).
    Can be used as a context manager; close() may be called more than once.
    """

    def __init__(self, **kwargs):
        self.fps = kwargs.get('fps', 30)
        self.wfp = kwargs.get('wfp', 'video.mp4')
        self.video_format = kwargs.get('format', 'mp4')
        self.codec = kwargs.get('codec', 'libx264')
        self.quality = kwargs.get('quality')
        self.pixelformat = kwargs.get('pixelformat', 'yuv420p')
        self.image_mode = kwargs.get('image_mode', 'rgb')
        self.ffmpeg_params = kwargs.get('ffmpeg_params')

        self.writer = imageio.get_writer(
            self.wfp, fps=self.fps, format=self.video_format,
            codec=self.codec, quality=self.quality,
            ffmpeg_params=self.ffmpeg_params, pixelformat=self.pixelformat
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow exceptions

    def write(self, image):
        """Append one frame; 'bgr' frames get their channels reversed first."""
        if self.image_mode.lower() == 'bgr':
            self.writer.append_data(image[..., ::-1])
        else:
            self.writer.append_data(image)

    def close(self):
        """Finalise the video file; safe to call repeatedly."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None


def change_video_fps(input_file, output_file, fps=20, codec='libx264', crf=5):
    """Re-encode `input_file` to `output_file` with the given frame rate, codec and crf."""
    exec_cmd([
        'ffmpeg', '-i', input_file,
        '-c:v', codec, '-crf', str(crf), '-r', str(fps),
        output_file, '-y',
    ])


def get_fps(filepath):
    """
    Return the average frame rate of the first video stream of `filepath` as a float.

    Raises ValueError when the file has no video stream; a zero denominator in the
    reported rate raises ZeroDivisionError.
    """
    import ffmpeg
    probe = ffmpeg.probe(filepath)
    video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
    if video_stream is None:
        raise ValueError(f'No video stream found in {filepath}')
    # avg_frame_rate is a ratio string such as '30000/1001'; parse it instead of eval()-ing it
    fps = float(Fraction(video_stream['avg_frame_rate']))
    return fps


# ===== video2template.py (new file, mode 100644) =====
# coding: utf-8

"""
[WIP] Pipeline for video template preparation
"""

import tyro
from src.config.crop_config import CropConfig
from src.config.inference_config import InferenceConfig
from src.config.argument_config import ArgumentConfig
from src.template_maker import TemplateMaker


def partial_fields(target_class, kwargs):
    """Instantiate `target_class` with only those entries of `kwargs` whose key is an attribute of the class."""
    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})


def main():
    """Parse the command line into ArgumentConfig and build a motion template from the driving video."""
    # set tyro theme
    tyro.extras.set_accent_color("bright_cyan")
    args = tyro.cli(ArgumentConfig)

    # specify configs for inference
    inference_cfg = partial_fields(InferenceConfig, args.__dict__)  # use attribute of args to initial InferenceConfig
    crop_cfg = partial_fields(CropConfig, args.__dict__)  # use attribute of args to initial CropConfig

    video_template_maker = TemplateMaker(
        inference_cfg=inference_cfg,
        crop_cfg=crop_cfg
    )

    # run
    video_template_maker.make_motion_template(args.driving_video_path, args.template_output_dir)


if __name__ == '__main__':
    main()