Need Help in Python Script for Running FP8 Model on 8GB VRAM

#206
by shobhit6702 - opened

Hello, thank you for providing such an amazing model. I am trying to run the FP8 model on 8 GB VRAM and 32 GB RAM. Currently I am using the code below, which gives a CUDA out-of-memory error when using model offloading, and a TypeError when using sequential offloading.

Any help or guidance would be appreciated.

My code:

from optimum.quanto import freeze, qfloat8, quantize
from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
import torch

dtype = torch.bfloat16
bfl_repo = "./model"

scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(bfl_repo, subfolder="scheduler")
text_encoder = CLIPTextModel.from_pretrained("./openai")
tokenizer = CLIPTokenizer.from_pretrained("./openai")
text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
tokenizer_2 = T5TokenizerFast.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype)
vae = AutoencoderKL.from_pretrained(bfl_repo, subfolder="vae", torch_dtype=dtype)
transformer = FluxTransformer2DModel.from_pretrained(bfl_repo, subfolder="transformer", torch_dtype=dtype)

# Quantize the two largest components to FP8 and freeze the quantized weights
quantize(transformer, weights=qfloat8)
freeze(transformer)

quantize(text_encoder_2, weights=qfloat8)
freeze(text_encoder_2)

pipeline = FluxPipeline(
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=text_encoder_2,
    tokenizer_2=tokenizer_2,
    vae=vae,
    transformer=transformer,
)

pipeline.enable_model_cpu_offload()  # gives a CUDA out-of-memory error
# pipeline.enable_sequential_cpu_offload()  # fails with: TypeError: QBytesTensor.__new__() missing 5 required positional arguments: 'axis', 'size', 'stride', 'data', and 'scale'
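# (My guess: sequential offload moves weights one parameter at a time via
# accelerate, and quanto's QBytesTensor subclass does not survive that
# per-tensor reconstruction, hence the missing-arguments TypeError.
# Happy to be corrected on this.)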

prompt = "A cat holding a sign that says hello world"

print('Image Generation Started')
image = pipeline(
    prompt,
    guidance_scale=3.5,
    output_type="pil",
    num_inference_steps=20,
    generator=torch.Generator("cpu").manual_seed(0),
).images[0]

print('Image Generation Ended')

image.save("flux-fp8-dev.png")
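
In case it is relevant, here is a variation I was thinking of trying (untested; I am assuming the AutoencoderKL enable_slicing()/enable_tiling() helpers reduce peak memory enough in this setup):

# Untested sketch: cut VAE decode memory with slicing/tiling, then retry
# model offloading. Whether this fits in 8 GB VRAM is an assumption.
pipeline.vae.enable_slicing()  # decode the batch one image at a time
pipeline.vae.enable_tiling()   # decode in tiles to cap peak memory
pipeline.enable_model_cpu_offload()

image = pipeline(
    prompt,
    guidance_scale=3.5,
    output_type="pil",
    num_inference_steps=20,
    generator=torch.Generator("cpu").manual_seed(0),
).images[0]

If the OOM actually happens in the transformer forward pass rather than in the VAE decode, this will not help, so please correct me if I am looking in the wrong place.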
