Commit 0b17049

update examples and downloaders
1 parent 0af60b9 commit 0b17049

21 files changed: +409 −100 lines

diffsynth/models/__init__.py

+137 −20
@@ -48,23 +48,129 @@
     ],
 }
 preset_models_on_modelscope = {
+    # Hunyuan DiT
     "HunyuanDiT": [
         ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
         ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
         ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
         ("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
     ],
+    # Stable Video Diffusion
     "stable-video-diffusion-img2vid-xt": [
         ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
     ],
+    # ExVideo
     "ExVideo-SVD-128f-v1": [
         ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
     ],
+    # Stable Diffusion
+    "StableDiffusion_v15": [
+        ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
+    ],
+    "DreamShaper_8": [
+        ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
+    ],
+    "AingDiffusion_v12": [
+        ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
+    ],
+    "Flat2DAnimerge_v45Sharp": [
+        ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
+    ],
+    # Textual Inversion
+    "TextualInversion_VeryBadImageNegative_v1.3": [
+        ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
+    ],
+    # Stable Diffusion XL
+    "StableDiffusionXL_v1": [
+        ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
+    ],
+    "BluePencilXL_v200": [
+        ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
+    ],
+    "StableDiffusionXL_Turbo": [
+        ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
+    ],
+    # ControlNet
+    "ControlNet_v11f1p_sd15_depth": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
+    ],
+    "ControlNet_v11p_sd15_softedge": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
+    ],
+    "ControlNet_v11f1e_sd15_tile": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
+    ],
+    "ControlNet_v11p_sd15_lineart": [
+        ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
+        ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
+        ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
+    ],
+    # AnimateDiff
+    "AnimateDiff_v2": [
+        ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
+    ],
+    "AnimateDiff_xl_beta": [
+        ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
+    ],
+    # RIFE
+    "RIFE": [
+        ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
+    ],
+    # Beautiful Prompt
+    "BeautifulPrompt": [
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+        ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
+    ],
+    # Translator
+    "opus-mt-zh-en": [
+        ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
+        ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
+    ],
+    # IP-Adapter
+    "IP-Adapter-SD": [
+        ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
+        ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
+    ],
+    "IP-Adapter-SDXL": [
+        ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
+        ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
+    ],
 }
 Preset_model_id: TypeAlias = Literal[
     "HunyuanDiT",
     "stable-video-diffusion-img2vid-xt",
-    "ExVideo-SVD-128f-v1"
+    "ExVideo-SVD-128f-v1",
+    "StableDiffusion_v15",
+    "DreamShaper_8",
+    "AingDiffusion_v12",
+    "Flat2DAnimerge_v45Sharp",
+    "TextualInversion_VeryBadImageNegative_v1.3",
+    "StableDiffusionXL_v1",
+    "BluePencilXL_v200",
+    "StableDiffusionXL_Turbo",
+    "ControlNet_v11f1p_sd15_depth",
+    "ControlNet_v11p_sd15_softedge",
+    "ControlNet_v11f1e_sd15_tile",
+    "ControlNet_v11p_sd15_lineart",
+    "AnimateDiff_v2",
+    "AnimateDiff_xl_beta",
+    "RIFE",
+    "BeautifulPrompt",
+    "opus-mt-zh-en",
+    "IP-Adapter-SD",
+    "IP-Adapter-SDXL",
 ]
 Preset_model_website: TypeAlias = Literal[
     "HuggingFace",
@@ -80,6 +186,26 @@
 }
 
 
+def download_models(
+    model_id_list: List[Preset_model_id] = [],
+    downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
+):
+    downloaded_files = []
+    for model_id in model_id_list:
+        for website in downloading_priority:
+            if model_id in website_to_preset_models[website]:
+                for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]:
+                    # Check if the file is downloaded.
+                    file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
+                    if file_to_download in downloaded_files:
+                        continue
+                    # Download
+                    website_to_download_fn[website](model_id, origin_file_path, local_dir)
+                    if os.path.basename(origin_file_path) in os.listdir(local_dir):
+                        downloaded_files.append(file_to_download)
+    return downloaded_files
+
+
 class ModelManager:
     def __init__(
         self,
@@ -94,28 +220,19 @@ def __init__
         self.model = {}
         self.model_path = {}
         self.textual_inversion_dict = {}
-        downloaded_files = self.download_models(model_id_list, downloading_priority)
+        downloaded_files = download_models(model_id_list, downloading_priority)
         self.load_models(downloaded_files + file_path_list)
 
-    def download_models(
+    def load_model_from_origin(
         self,
-        model_id_list: List[Preset_model_id] = [],
-        downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
+        download_from: Preset_model_website = "ModelScope",
+        model_id = "",
+        origin_file_path = "",
+        local_dir = ""
     ):
-        downloaded_files = []
-        for model_id in model_id_list:
-            for website in downloading_priority:
-                if model_id in website_to_preset_models[website]:
-                    for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]:
-                        # Check if the file is downloaded.
-                        file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
-                        if file_to_download in downloaded_files:
-                            continue
-                        # Download
-                        website_to_download_fn[website](model_id, origin_file_path, local_dir)
-                        if os.path.basename(origin_file_path) in os.listdir(local_dir):
-                            downloaded_files.append(file_to_download)
-        return downloaded_files
+        website_to_download_fn[download_from](model_id, origin_file_path, local_dir)
+        file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
+        self.load_model(file_to_download)
 
     def is_stable_video_diffusion(self, state_dict):
         param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
@@ -158,7 +275,7 @@ def is_sd_lora(self, state_dict):
 
     def is_translator(self, state_dict):
         param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
-        return param_name in state_dict and len(state_dict) == 254
+        return param_name in state_dict and len(state_dict) == 258
 
     def is_ipadapter(self, state_dict):
         return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024])

diffsynth/models/sd_ipadapter.py

+1 −1
@@ -29,7 +29,7 @@ def set_full_adapter(self):
 
     def set_less_adapter(self):
         # IP-Adapter for SD v1.5 doesn't support this feature.
-        self.set_full_adapter(self)
+        self.set_full_adapter()
 
     def forward(self, hidden_states, scale=1.0):
         hidden_states = self.image_proj(hidden_states)
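The one-line change removes a stray self argument: calling a bound method as self.set_full_adapter(self) passes the instance a second time and fails at runtime. A minimal illustration of that failure mode (toy class, not the library code):

class Adapter:
    def set_full_adapter(self):
        pass

    def set_less_adapter(self):
        # The bug being fixed: the explicit self becomes an extra positional argument.
        self.set_full_adapter(self)  # TypeError: takes 1 positional argument but 2 were given

Adapter().set_less_adapter()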

diffsynth/pipelines/stable_diffusion_xl.py

+5
@@ -87,6 +87,7 @@ def __call__(
         input_image=None,
         ipadapter_images=None,
         ipadapter_scale=1.0,
+        ipadapter_use_instant_style=False,
         controlnet_image=None,
         denoising_strength=1.0,
         height=1024,
@@ -134,6 +135,10 @@
 
         # IP-Adapter
         if ipadapter_images is not None:
+            if ipadapter_use_instant_style:
+                self.ipadapter.set_less_adapter()
+            else:
+                self.ipadapter.set_full_adapter()
             ipadapter_image_encoding = self.ipadapter_image_encoder(ipadapter_images)
             ipadapter_kwargs_list_posi = self.ipadapter(ipadapter_image_encoding, scale=ipadapter_scale)
             ipadapter_kwargs_list_nega = self.ipadapter(torch.zeros_like(ipadapter_image_encoding))
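The new ipadapter_use_instant_style flag switches the SDXL IP-Adapter between the reduced adapter (set_less_adapter) and the full adapter on a per-call basis. A usage sketch, where pipe stands for an already-constructed SDXL image pipeline from this module and every argument value is illustrative:

from PIL import Image

style_image = Image.open("style_reference.png")  # hypothetical reference image
image = pipe(
    prompt="a lighthouse at dusk, highly detailed",
    ipadapter_images=[style_image],
    ipadapter_scale=0.8,
    ipadapter_use_instant_style=True,  # True -> set_less_adapter(); default False -> set_full_adapter()
    height=1024, width=1024,
)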

diffsynth/prompts/sdxl_prompter.py

+4
@@ -41,6 +41,10 @@ def encode_prompt(
         add_text_embeds, prompt_emb_2 = text_encoder_2(input_ids_2, clip_skip=clip_skip_2)
 
         # Merge
+        if prompt_emb_1.shape[0] != prompt_emb_2.shape[0]:
+            max_batch_size = min(prompt_emb_1.shape[0], prompt_emb_2.shape[0])
+            prompt_emb_1 = prompt_emb_1[: max_batch_size]
+            prompt_emb_2 = prompt_emb_2[: max_batch_size]
         prompt_emb = torch.concatenate([prompt_emb_1, prompt_emb_2], dim=-1)
 
         # For very long prompt, we only use the first 77 tokens to compute `add_text_embeds`.
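The added guard keeps the two text-encoder outputs concatenable when they end up with different leading batch sizes (for example, if a long prompt is split into a different number of 77-token chunks by the two tokenizers): both tensors are truncated to the smaller batch before the channel-wise concat. A standalone sketch of the shape handling, with illustrative tensors:

import torch

prompt_emb_1 = torch.randn(3, 77, 768)   # e.g. three chunks from text encoder 1
prompt_emb_2 = torch.randn(2, 77, 1280)  # e.g. two chunks from text encoder 2

if prompt_emb_1.shape[0] != prompt_emb_2.shape[0]:
    max_batch_size = min(prompt_emb_1.shape[0], prompt_emb_2.shape[0])
    prompt_emb_1 = prompt_emb_1[:max_batch_size]
    prompt_emb_2 = prompt_emb_2[:max_batch_size]

prompt_emb = torch.concatenate([prompt_emb_1, prompt_emb_2], dim=-1)
print(prompt_emb.shape)  # torch.Size([2, 77, 2048])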

examples/Diffutoon/diffutoon_toon_shading.py

+10 −4
@@ -1,15 +1,21 @@
-from diffsynth import SDVideoPipelineRunner
+from diffsynth import SDVideoPipelineRunner, download_models
 
 
-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
 # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth)
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-
+download_models([
+    "AingDiffusion_v12",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])
 # The original video in the example is https://www.bilibili.com/video/BV1iG411a7sQ/.
 
 config = {
@@ -63,7 +69,7 @@
                 "end_frame_id": 30
             }
         ],
-        "output_folder": "data/examples/diffutoon/output",
+        "output_folder": "output",
         "fps": 30
     },
     "pipeline": {

examples/Diffutoon/diffutoon_toon_shading_with_editing_signals.py

+13 −5
@@ -1,8 +1,8 @@
-from diffsynth import SDVideoPipelineRunner
+from diffsynth import SDVideoPipelineRunner, download_models
 import os
 
 
-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
@@ -14,7 +14,15 @@
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-
+download_models([
+    "AingDiffusion_v12",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "ControlNet_v11f1p_sd15_depth",
+    "ControlNet_v11p_sd15_softedge",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])
 # The original video in the example is https://www.bilibili.com/video/BV1zu4y1s7Ec/.
 
 config_stage_1 = {
@@ -67,7 +75,7 @@
                 "end_frame_id": 30
             }
         ],
-        "output_folder": "data/examples/diffutoon_edit/color_video",
+        "output_folder": "output/color_video",
         "fps": 25
     },
     "smoother_configs": [
@@ -153,7 +161,7 @@
                 "end_frame_id": 30
             }
         ],
-        "output_folder": "data/examples/diffutoon_edit/output",
+        "output_folder": "output/edited_video",
        "fps": 30
     },
     "pipeline": {

examples/Diffutoon/sd_toon_shading.py

+10 −9
@@ -1,18 +1,22 @@
-from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, save_frames
-from diffsynth.extensions.RIFE import RIFESmoother
+from diffsynth import ModelManager, SDVideoPipeline, ControlNetConfigUnit, VideoData, save_video, download_models
 import torch
 
 
-# Download models
+# Download models (automatically)
 # `models/stable_diffusion/flat2DAnimerge_v45Sharp.safetensors`: [link](https://civitai.com/api/download/models/266360?type=Model&format=SafeTensor&size=pruned&fp=fp16)
 # `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt)
 # `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth)
 # `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth)
 # `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth)
 # `models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth)
 # `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16)
-# `models/RIFE/flownet.pkl`: [link](https://drive.google.com/file/d/1APIzVeI-4ZZCEuIRE1m6WYfSCaOsi_7_/view?usp=sharing)
-
+download_models([
+    "Flat2DAnimerge_v45Sharp",
+    "AnimateDiff_v2",
+    "ControlNet_v11p_sd15_lineart",
+    "ControlNet_v11f1e_sd15_tile",
+    "TextualInversion_VeryBadImageNegative_v1.3"
+])
 
 # Load models
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
@@ -22,7 +26,6 @@
     "models/AnimateDiff/mm_sd_v15_v2.ckpt",
     "models/ControlNet/control_v11p_sd15_lineart.pth",
     "models/ControlNet/control_v11f1e_sd15_tile.pth",
-    "models/RIFE/flownet.pkl"
 ])
 pipe = SDVideoPipeline.from_model_manager(
     model_manager,
@@ -39,12 +42,11 @@
         )
     ]
 )
-smoother = RIFESmoother.from_model_manager(model_manager)
 
 # Load video (we only use 60 frames for quick testing)
 # The original video is here: https://www.bilibili.com/video/BV19w411A7YJ/
 video = VideoData(
-    video_file="data/bilibili_videos/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻/៸៸᳐_⩊_៸៸᳐ 66 微笑调查队🌻 - 1.66 微笑调查队🌻(Av278681824,P1).mp4",
+    video_file="data/examples/bilibili/BV19w411A7YJ.mp4",
     height=1024, width=1024)
 input_video = [video[i] for i in range(40*60, 41*60)]
 
@@ -59,7 +61,6 @@
     animatediff_batch_size=32, animatediff_stride=16,
     vram_limit_level=0,
 )
-output_video = smoother(output_video)
 
 # Save video
 save_video(output_video, "output_video.mp4", fps=60)
