model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        legacy: False

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          # crossattn and vector cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          # vector cond
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
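For context, this is a DiffusionEngine config in the format used by Stability AI's generative-models (sgm) codebase; the 9 UNet input channels suggest an inpainting-style variant (noised latent plus mask and masked-image latents concatenated), while the rest matches the standard SDXL base layout. Below is a minimal loading sketch, assuming the sgm package is installed and using hypothetical config/checkpoint paths; instantiate_from_config is the sgm.util helper that resolves each target/params pair.

```python
# Minimal sketch: build the model described by this config.
# Assumptions: config and checkpoint paths below are placeholders.
import torch
from omegaconf import OmegaConf

from sgm.util import instantiate_from_config

config = OmegaConf.load("configs/sd_xl_inpainting.yaml")  # hypothetical path to this file
model = instantiate_from_config(config.model)             # builds the DiffusionEngine and its sub-modules

# Optionally restore pretrained weights (checkpoint path is a placeholder).
ckpt = torch.load("checkpoints/sd_xl_inpainting.ckpt", map_location="cpu")
model.load_state_dict(ckpt.get("state_dict", ckpt), strict=False)
model.eval()
```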