# sd_xl_inpaint.yaml

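# SGM config for the SDXL inpainting model: a DiffusionEngine with a 9-channel UNet,
# the two SDXL text encoders plus size/crop vector conditioning, and the SDXL KL
# autoencoder as the first stage.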
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
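    # UNet backbone. in_channels is 9 rather than the usual 4: the noised latent (4 ch)
    # is concatenated with the VAE-encoded masked image (4 ch) and a resized mask (1 ch),
    # following the standard SD/SDXL inpainting input layout.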
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        legacy: False
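    # Conditioning. The two text encoders below emit 768-dim (CLIP ViT-L) and 1280-dim
    # (OpenCLIP ViT-bigG) crossattn features, which the GeneralConditioner concatenates to
    # match context_dim: 2048. The pooled OpenCLIP vector (1280) plus the three 2x256
    # ConcatTimestepEmbedderND embeddings form the vector conditioning:
    # 1280 + 3 * 512 = 2816 = adm_in_channels.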
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          # crossattn and vector cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          # vector cond
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
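    # First stage: the SDXL KL autoencoder. Image latents are multiplied by
    # scale_factor: 0.13025 before diffusion and divided by it before decoding.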
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
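
For reference, a minimal sketch (not the repo's official loader) of how a config like this is typically turned into a model with the sgm helpers. It assumes the Stability-AI generative-models package (sgm) is installed; the config and checkpoint paths are placeholders.

# Minimal sketch, assuming the Stability-AI generative-models repo (sgm) is installed.
# The paths and checkpoint filename below are placeholders, not part of this config.
import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

config = OmegaConf.load("configs/inference/sd_xl_inpaint.yaml")  # path to this file (placeholder)
model = instantiate_from_config(config.model)                    # builds the DiffusionEngine above

ckpt = torch.load("checkpoints/sd_xl_inpainting.ckpt", map_location="cpu")  # placeholder checkpoint
state_dict = ckpt.get("state_dict", ckpt)                                   # .ckpt files usually nest weights here
missing, unexpected = model.load_state_dict(state_dict, strict=False)
model = model.eval()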