import time

import pdfplumber

from DocumentManager.Constants import NET_ACCESS, TIMESLEEP
from .Document import Document
from DocumentManager.ProgressManager import ProgressManager
from PIL import Image

class PDFDocument(Document):
    """Document whose content is read from a PDF file via ``pdfplumber``."""

    def extract_content(self):
        """Extract the text of every page of the PDF at ``self.path``.

        Pages are read in order with ``extract_text(layout=True)``. Each
        page that yields non-empty text is added to the result preceded by
        two newlines. A progress update is published after every page and
        once more just before the combined text is cleaned.

        Returns:
            tuple: ``(text, images)``. ``text`` is the combined page text
            after being passed through ``self.clean_text`` and then
            ``self.remove_repeated_lines``. ``images`` is a list that is
            always empty here, because image extraction is not performed.
        """
        # Image extraction is currently disabled; the empty list is kept so
        # the return shape stays (text, images) for callers.
        images = []
        progress_manager = ProgressManager()

        # Collect per-page text and join once at the end instead of growing
        # a string with ``+=`` on every page.
        page_chunks = []

        with pdfplumber.open(self.path) as pdf:
            total_page = len(pdf.pages)
            for i, page in enumerate(pdf.pages, start=1):
                # extract_text may return a falsy value for a page without
                # text; such pages contribute nothing to the output.
                page_text = page.extract_text(layout=True)
                if page_text:
                    page_chunks.append(f"\n\n{page_text}")

                # update progress
                info = {
                    "title": "Extracting PDF pages",
                    "completed": i,
                    "id": "PDFTEXTEXTRACT",
                    "total": total_page,
                    "details": {
                        "file": self.path,
                    },
                }
                # NOTE(review): presumably this pause paces progress updates
                # when running without network access -- confirm intent.
                if not NET_ACCESS:
                    time.sleep(TIMESLEEP)
                progress_manager.update(info)

        text = "".join(page_chunks)

        # clean text
        # NOTE(review): this task is announced at 0/1 and no completion
        # update is published afterwards -- confirm whether the caller or
        # ProgressManager is expected to close it.
        info = {
            "title": "Cleaning text",
            "completed": 0,
            "id": "PDFCLEAN",
            "total": 1,
            "details": {"file": self.path},
        }
        progress_manager.update(info)
        text = self.clean_text(text)
        text = self.remove_repeated_lines(text)

        # return text and list of images
        return text, images