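"""COCO-style dataset for GLIGEN box-grounded training.

Each item pairs a resized, center-cropped image with up to ``max_boxes_per_data``
normalized bounding boxes, a validity mask, precomputed per-box text embeddings,
and a tokenized image caption.
"""
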
import os
import random

import torch
import torchvision.transforms as transforms
from PIL import Image


def recalculate_box_and_verify_if_valid(x, y, w, h, image_size, original_image_size, min_box_size):
    """Map an (x, y, w, h) box from the original image into the resized,
    center-cropped image, and reject boxes that end up too small.

    Returns (valid, (x0, y0, x1, y1)) with corners in pixel coordinates of the
    transformed image.
    """
    # The image is resized so its shorter side equals image_size, then center-cropped.
    scale = image_size / min(original_image_size)
    crop_y = (original_image_size[1] * scale - image_size) // 2
    crop_x = (original_image_size[0] * scale - image_size) // 2
    # Apply the same scale and crop to the box corners, clamping to the crop window.
    x0 = max(x * scale - crop_x, 0)
    y0 = max(y * scale - crop_y, 0)
    x1 = min((x + w) * scale - crop_x, image_size)
    y1 = min((y + h) * scale - crop_y, image_size)

    # Reject boxes whose area is below min_box_size, a fraction of the crop area.
    if (x1 - x0) * (y1 - y0) / (image_size * image_size) < min_box_size:
        return False, (None, None, None, None)

    return True, (x0, y0, x1, y1)

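# A quick worked example of the box remapping above (illustrative numbers, not from
# any real annotation): a 640x480 image resized to image_size=512 gives
# scale = 512 / 480 ≈ 1.067, crop_x = (640 * scale - 512) // 2 = 85.0, crop_y = 0.0,
# so the COCO-style box (x=100, y=100, w=200, h=150) maps to roughly
# (21.7, 106.7, 235.0, 266.7) and covers about 13% of the crop, comfortably above
# the default min_box_size of 0.01.
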
class COCODataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data_path,
        image_path,
        image_size=512,
        min_box_size=0.01,
        max_boxes_per_data=8,
        tokenizer=None,
    ):
        super().__init__()
        self.min_box_size = min_box_size
        self.max_boxes_per_data = max_boxes_per_data
        self.image_size = image_size
        self.image_path = image_path
        self.tokenizer = tokenizer

        # Resize the shorter side to image_size, center-crop to a square, and
        # normalize pixel values from [0, 1] to [-1, 1].
        self.transforms = transforms.Compose(
            [
                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

        self.data_list = torch.load(data_path, map_location="cpu")

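    # Each entry of data_list is expected (judging from the fields accessed in
    # __getitem__) to be a dict with "file_path", "captions", and a list of "annos",
    # each annotation carrying a corner-format "bbox" and a precomputed 768-dim
    # "text_embeddings_before_projection" vector (e.g. from a CLIP text encoder).
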
    def __getitem__(self, index):
        if self.max_boxes_per_data > 99:
            raise ValueError("max_boxes_per_data is larger than 99; are you sure you want this many boxes per image?")

        out = {}

        data = self.data_list[index]
        image = Image.open(os.path.join(self.image_path, data["file_path"])).convert("RGB")
        original_image_size = image.size
        out["pixel_values"] = self.transforms(image)

        annos = data["annos"]

        areas, valid_annos = [], []
        for anno in annos:
            # Annotations store corner coordinates (x0, y0, x1, y1); convert to (x, y, w, h).
            x0, y0, x1, y1 = anno["bbox"]
            x, y, w, h = x0, y0, x1 - x0, y1 - y0
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(
                x, y, w, h, self.image_size, original_image_size, self.min_box_size
            )
            if valid:
                anno["bbox"] = [x0, y0, x1, y1]
                areas.append((x1 - x0) * (y1 - y0))
                valid_annos.append(anno)

        # Sort by area and keep the largest max_boxes_per_data objects.
        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[: self.max_boxes_per_data]
        valid_annos = [valid_annos[i] for i in wanted_idxs]

out["boxes"] = torch.zeros(self.max_boxes_per_data, 4)
|
|
out["masks"] = torch.zeros(self.max_boxes_per_data)
|
|
out["text_embeddings_before_projection"] = torch.zeros(self.max_boxes_per_data, 768)
|
|
|
|
for i, anno in enumerate(valid_annos):
|
|
out["boxes"][i] = torch.tensor(anno["bbox"]) / self.image_size
|
|
out["masks"][i] = 1
|
|
out["text_embeddings_before_projection"][i] = anno["text_embeddings_before_projection"]
|
|
|
|
        # Drop all grounding boxes for 10% of samples so the model also learns the
        # unconditioned case.
        prob_drop_boxes = 0.1
        if random.random() < prob_drop_boxes:
            out["masks"][:] = 0

        caption = random.choice(data["captions"])

        # Likewise, drop the caption for 50% of samples.
        prob_drop_captions = 0.5
        if random.random() < prob_drop_captions:
            caption = ""
        caption = self.tokenizer(
            caption,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        out["caption"] = caption

        return out

    def __len__(self):
        return len(self.data_list)
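

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original training pipeline: the paths and
    # the CLIP tokenizer choice below are assumptions. data_path must point to a
    # preprocessed torch file matching the structure noted above, and image_path to
    # the corresponding image directory.
    from torch.utils.data import DataLoader
    from transformers import CLIPTokenizer

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    dataset = COCODataset(
        data_path="data/coco_train.pth",  # hypothetical path
        image_path="data/coco/train2017",  # hypothetical path
        tokenizer=tokenizer,
    )
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    batch = next(iter(loader))
    # Expected tensor shapes: pixel_values (4, 3, 512, 512), boxes (4, 8, 4),
    # masks (4, 8), text_embeddings_before_projection (4, 8, 768).
    print({k: tuple(v.shape) for k, v in batch.items() if torch.is_tensor(v)})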