Spaces:

AskUI
/

pta-text-v0.1

Sleeping

App Files Files Community

gitlost-murali committed on Feb 14

Commit

da59cbe

•

1 Parent(s): 33024b0

initial checkpoint inference push

Browse files

Files changed (4) hide show

Dockerfile +24 -0
app.py +116 -0
requirements.txt +4 -0
utils.py +144 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,24 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM ubuntu:22.04
+# Install curl and git, register the git-lfs apt repository, then install git-lfs.
+RUN apt-get update && apt-get install -y curl git && \
+    curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
+    apt-get install -y git-lfs
+WORKDIR /code
+# Clone the model repository (including its Git LFS files) into /code/model/.
+RUN git lfs clone https://huggingface.co/AskUI/pta-text-0.1 /code/model/
+COPY ./requirements.txt /code/requirements.txt
+# Refresh the package index in the same layer as the install, so a cached
+# "apt-get update" layer from an earlier build can never serve stale lists.
+RUN apt-get update && apt-get install -y python3 python3-pip
+# RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+RUN pip install --upgrade -r /code/requirements.txt
+COPY . .
+CMD ["python3", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import gradio as gr
+from PIL import Image, ImageDraw
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from transformers import Pix2StructProcessor, Pix2StructVisionModel
+from utils import download_default_font, render_header
+class Pix2StructForRegression(nn.Module):
+    def __init__(self, sourcemodel_path, device):
+        super(Pix2StructForRegression, self).__init__()
+        self.model = Pix2StructVisionModel.from_pretrained(sourcemodel_path)
+        print("Pix2StructForRegression Model is Loaded...")
+        self.regression_layer1 = nn.Linear(768, 1536)
+        self.dropout1 = nn.Dropout(0.1)
+        self.regression_layer2 = nn.Linear(1536, 768)
+        self.dropout2 = nn.Dropout(0.1)
+        self.regression_layer3 = nn.Linear(768, 2)
+        self.device = device
+        print("Regression Layers are Loaded...")
+    def forward(self, *args, **kwargs):
+        outputs = self.model(*args, **kwargs)
+        sequence_output = outputs.last_hidden_state
+        first_token_output = sequence_output[:, 0, :]
+        x = F.relu(self.regression_layer1(first_token_output))
+        x = F.relu(self.regression_layer2(x))
+        regression_output = torch.sigmoid(self.regression_layer3(x))
+        return regression_output
+    def load_state_dict_file(self, checkpoint_path, strict=True):
+        print("Loading Model Weights...")
+        state_dict = torch.load(checkpoint_path, map_location=self.device)
+        self.load_state_dict(state_dict, strict=strict)
+        print("Model Weights are Loaded...")
+class Inference:
+    def __init__(self) -> None:
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model, self.processor = self.load_model_and_processor("matcha-base", "model/pta-text-v0.1.pt")
+        print("Model and Processor are Loaded...")
+    def load_model_and_processor(self, model_name, checkpoint_path):
+        model = Pix2StructForRegression(sourcemodel_path=model_name, device=self.device)
+        model.load_state_dict_file(checkpoint_path=checkpoint_path)
+        model.eval()
+        model = model.to(self.device)
+        processor = Pix2StructProcessor.from_pretrained(model_name, is_vqa=False)
+        return model, processor
+    def prepare_image(self, image, prompt, processor):
+        image = image.resize((1920, 1080))
+        download_default_font_path = download_default_font()
+        rendered_image, _, render_variables = render_header(
+            image=image,
+            header=prompt,
+            bbox={"xmin": 0, "ymin": 0, "xmax": 0, "ymax": 0},
+            font_path=download_default_font_path,
+        )
+        encoding = processor(
+            images=rendered_image,
+            max_patches=2048,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        return encoding, render_variables
+    def predict_coordinates(self, encoding, model, render_variables):
+        with torch.no_grad():
+            pred_regression_outs = model(flattened_patches=encoding["flattened_patches"], attention_mask=encoding["attention_mask"])
+            new_height = render_variables["height"]
+            new_header_height = render_variables["header_height"]
+            new_total_height = render_variables["total_height"]
+            pred_regression_outs[:, 1] = (
+                (pred_regression_outs[:, 1] * new_total_height) - new_header_height
+            ) / new_height
+            pred_coordinates = pred_regression_outs.squeeze().tolist()
+        return pred_coordinates
+    def draw_circle_on_image(self, image, coordinates):
+        x, y = coordinates[0] * image.width, coordinates[1] * image.height
+        print(coordinates)
+        draw = ImageDraw.Draw(image)
+        radius = 5
+        draw.ellipse((x-radius, y-radius, x+radius, y+radius), fill="red")
+        return image
+    def process_image_and_draw_circle(self, image, prompt):
+        encoding, render_variables = self.prepare_image(image, prompt, self.processor)
+        pred_coordinates = self.predict_coordinates(encoding.to(self.device) , self.model, render_variables)
+        result_image = self.draw_circle_on_image(image, pred_coordinates)
+        return result_image
+def main():
+    inference = Inference()
+    print("Model and Processor are Loaded...")
+    # Gradio Interface
+    iface = gr.Interface(
+        fn=inference.process_image_and_draw_circle,
+        inputs=[gr.Image(type="pil", label = "Upload Image"),
+                gr.Textbox(label = "Prompt", placeholder="Enter prompt here...")],
+        outputs=gr.Image(type="pil"),
+        title="Pix2Struct Image Processing",
+        description="Upload an image and enter a prompt to see the model's prediction."
+    )
+    iface.launch()
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+torch
+transformers
+gradio
+Pillow
+# utils.py imports huggingface_hub directly, so declare it explicitly.
+huggingface_hub

utils.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import io
+import os
+import textwrap
+from typing import Dict, Optional, Tuple
+from huggingface_hub import hf_hub_download
+from PIL import Image, ImageDraw, ImageFont
+DEFAULT_FONT_PATH = "ybelkada/fonts"
+def download_default_font():
+    font_path = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
+    return font_path
+def render_text(
+    text: str,
+    text_size: int = 36,
+    text_color: str = "black",
+    background_color: str = "white",
+    left_padding: int = 5,
+    right_padding: int = 5,
+    top_padding: int = 5,
+    bottom_padding: int = 5,
+    font_bytes: Optional[bytes] = None,
+    font_path: Optional[str] = None,
+) -> Image.Image:
+    """
+    Render text. This script is entirely adapted from the original script that can be found here:
+    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py
+    Args:
+        text (`str`, *optional*, defaults to ):
+            Text to render.
+        text_size (`int`, *optional*, defaults to 36):
+            Size of the text.
+        text_color (`str`, *optional*, defaults to `"black"`):
+            Color of the text.
+        background_color (`str`, *optional*, defaults to `"white"`):
+            Color of the background.
+        left_padding (`int`, *optional*, defaults to 5):
+            Padding on the left.
+        right_padding (`int`, *optional*, defaults to 5):
+            Padding on the right.
+        top_padding (`int`, *optional*, defaults to 5):
+            Padding on the top.
+        bottom_padding (`int`, *optional*, defaults to 5):
+            Padding on the bottom.
+        font_bytes (`bytes`, *optional*):
+            Bytes of the font to use. If `None`, the default font will be used.
+        font_path (`str`, *optional*):
+            Path to the font to use. If `None`, the default font will be used.
+    """
+    wrapper = textwrap.TextWrapper(
+        width=80
+    )  # Add new lines so that each line is no more than 80 characters.
+    lines = wrapper.wrap(text=text)
+    wrapped_text = "\n".join(lines)
+    if font_bytes is not None and font_path is None:
+        font = io.BytesIO(font_bytes)
+    elif font_path is not None:
+        font = font_path
+    else:
+        font = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
+        raise ValueError(
+            "Either font_bytes or font_path must be provided. "
+            f"Using default font {font}."
+        )
+    font = ImageFont.truetype(font, encoding="UTF-8", size=text_size)
+    # Use a temporary canvas to determine the width and height in pixels when
+    # rendering the text.
+    temp_draw = ImageDraw.Draw(Image.new("RGB", (1, 1), background_color))
+    _, _, text_width, text_height = temp_draw.textbbox((0, 0), wrapped_text, font)
+    # Create the actual image with a bit of padding around the text.
+    image_width = text_width + left_padding + right_padding
+    image_height = text_height + top_padding + bottom_padding
+    image = Image.new("RGB", (image_width, image_height), background_color)
+    draw = ImageDraw.Draw(image)
+    draw.text(
+        xy=(left_padding, top_padding), text=wrapped_text, fill=text_color, font=font
+    )
+    return image
+# Adapted from https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L87
+def render_header(
+    image: Image.Image, header: str, bbox: Dict[str, float], font_path: str, **kwargs
+) -> Tuple[Image.Image, Tuple[float, float, float, float]]:
+    """
+    Renders the input text as a header on the input image and updates the bounding box.
+    Args:
+        image (Image.Image):
+            The image to render the header on.
+        header (str):
+            The header text.
+        bbox (Dict[str,float]):
+            The bounding box in relative position (0-1), format ("x_min": 0,
+                                                                 "y_min": 0,
+                                                                 "x_max": 0,
+                                                                 "y_max": 0).
+        input_data_format (Union[str, ChildProcessError], optional):
+            The data format of the image.
+    Returns:
+        Tuple[Image.Image, Dict[str, float] ]:
+        The image with the header rendered and the updated bounding box.
+    """
+    assert os.path.exists(font_path), f"Font path {font_path} does not exist."
+    header_image = render_text(text=header, font_path=font_path, **kwargs)
+    new_width = max(header_image.width, image.width)
+    new_height = int(image.height * (new_width / image.width))
+    new_header_height = int(header_image.height * (new_width / header_image.width))
+    new_image = Image.new("RGB", (new_width, new_height + new_header_height), "white")
+    new_image.paste(header_image.resize((new_width, new_header_height)), (0, 0))
+    new_image.paste(image.resize((new_width, new_height)), (0, new_header_height))
+    new_total_height = new_image.height
+    new_bbox = {
+        "xmin": bbox["xmin"],
+        "ymin": ((bbox["ymin"] * new_height) + new_header_height)
+        / new_total_height,  # shift y_min down by the header's relative height
+        "xmax": bbox["xmax"],
+        "ymax": ((bbox["ymax"] * new_height) + new_header_height)
+        / new_total_height,  # shift y_min down by the header's relative height
+    }
+    return (
+        new_image,
+        new_bbox,
+        {
+            "width": new_width,
+            "height": new_height,
+            "header_height": new_header_height,
+            "total_height": new_total_height,
+        },
+    )