HEX

File: /var/www/html/laravel/app/Models/MelPdfFile.php
<?php

namespace App\Models;

use DateTime;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\MultipartStream;
use Illuminate\Database\Eloquent\Model;

/**
 * @property int $id
 * @property string|null $file_name
 * @property string|null $file_path
 * @property int $pages_total
 * @property int $pages_processed
 * @property int $status
 */
/**
 * Uploaded PDF file that is converted page by page into images, then into
 * text, and finally split into Mel records.
 *
 * Processing is resumable: the counters `pages_converted` / `pages_processed`
 * and the `status` column are persisted after every page, so a later call to
 * {@see MelPdfFile::process()} continues where the previous one stopped.
 *
 * @property int $pages_converted
 * @property string|null $extract_data_with
 * @property string|null $content
 * @property string|null $log_messages
 */
class MelPdfFile extends Model
{

    public const STATUS_PENDING = 0;

    public const STATUS_PROCESSING = 1;

    public const STATUS_IMAGES_DOWNLOADED = 2;

    public const STATUS_IMAGES_CONVERTED_TO_TEXT = 3;

    public const STATUS_MEL_OBJECTS_CREATED = 4;

    /** Minimum number of seconds between two processing attempts of a non-pending file. */
    private const RETRY_COOLDOWN_SECONDS = 300;

    /** Maximum number of text-extraction attempts per page image. */
    private const MAX_EXTRACT_ATTEMPTS = 3;

    protected $table = 'mel_pdf_files';

    protected $fillable = [
        'file_name',
        'file_path',
        'pages_total',
        'pages_converted',
        'pages_processed',
        'status',
        'extract_data_with',
        'content',
        'log_messages',
    ];

    /**
     * Runs (or resumes) the whole pipeline for this file:
     * PDF -> page images -> page text -> Mel records.
     *
     * Exceptions are not propagated: they are echoed and the method returns
     * null, leaving the persisted counters/status in place for a later retry.
     *
     * @return string|null Empty string when the run finished (or the file was
     *                     already fully processed); null when the call was
     *                     throttled or an exception was caught.
     */
    public function process()
    {
        try {
            // Attributes may come back from the database as numeric strings,
            // so normalise before any strict comparison.
            $status = (int) $this->status;

            // A finished file must not run again: re-running the split step
            // would create duplicate Mel records.
            if ($status === self::STATUS_MEL_OBJECTS_CREATED) {
                return '';
            }

            // Throttle on the TOTAL elapsed time. DateInterval's %h / %i are
            // only the hour/minute components and ignore whole days.
            $updatedAt = new DateTime((string) $this->updated_at);
            $secondsSinceUpdate = abs(time() - $updatedAt->getTimestamp());
            if ($status !== self::STATUS_PENDING && $secondsSinceUpdate < self::RETRY_COOLDOWN_SECONDS) {
                return;
            }

            if ($status === self::STATUS_PENDING) {
                $this->status = self::STATUS_PROCESSING;
                $this->save();
            }

            $uploadPath = storage_path('app/public/uploads/pdfImages');

            if ((int) $this->status === self::STATUS_PROCESSING) {
                $this->convertPdfToImages($uploadPath);
                $this->status = self::STATUS_IMAGES_DOWNLOADED;
                $this->save();
            }

            $this->extractTextFromImages($uploadPath);

            if ((int) $this->pages_processed === (int) $this->pages_total
                && (int) $this->status === self::STATUS_IMAGES_CONVERTED_TO_TEXT) {
                $this->splitContentToMelItems();
                $this->status = self::STATUS_MEL_OBJECTS_CREATED;
                $this->save();
            }

            return '';
        } catch (\Exception $e) {
            // Best effort: report the failure and let a later call resume.
            echo $e->getMessage() . $e->getTraceAsString();
        }
    }

    /**
     * Sends the PDF to the pdfToImages webhook once per not-yet-converted page
     * and downloads every image the webhook reports into $uploadPath.
     *
     * `pages_converted` is persisted after each page.
     *
     * @param string $uploadPath Directory the page images are written to.
     *
     * @throws \RuntimeException When the source PDF cannot be read.
     */
    private function convertPdfToImages(string $uploadPath): void
    {
        $pdfPath = storage_path('app/public/' . $this->file_path);
        if (!is_file($pdfPath) || !is_readable($pdfPath)) {
            throw new \RuntimeException("PDF file is not readable: {$pdfPath}");
        }

        $client = new Client();
        $pagesTotal = (int) $this->pages_total;

        for ($page = (int) $this->pages_converted; $page < $pagesTotal; $page++) {
            $pdfMultipart = new MultipartStream([
                [
                    'name' => 'file',
                    'contents' => fopen($pdfPath, 'r'),
                    'filename' => basename($pdfPath),
                    'headers' => [
                        'Content-Type' => mime_content_type($pdfPath),
                    ],
                ],
                [
                    'name' => 'fileName',
                    'contents' => md5($this->file_name),
                ],
                [
                    'name' => 'page',
                    'contents' => (string) $page, // Starting from 0
                ],
            ]);
            $headers = [
                'Content-Type' => 'multipart/form-data; boundary=' . $pdfMultipart->getBoundary(),
            ];

            $convertRequest = new N8n('POST', '/webhook/pdfToImages', $headers, $pdfMultipart);
            $images = json_decode((string) $client->send($convertRequest)->getBody());

            if (is_array($images)) {
                foreach ($images as $image) {
                    if (!is_object($image) || !property_exists($image, 'fileName')) {
                        continue;
                    }

                    // The name comes from a remote service: keep only its last
                    // path segment so it cannot escape the upload directory.
                    $imageName = basename((string) $image->fileName);
                    $savePath = $uploadPath . '/' . $imageName;

                    // NOTE(review): headers/body are passed exactly as before because
                    // the N8n constructor is not visible here; a GET most likely does
                    // not need the multipart payload - confirm and drop if possible.
                    $downloadRequest = new N8n(
                        'GET',
                        '/webhook/readFile?file=' . rawurlencode((string) $image->fileName),
                        $headers,
                        $pdfMultipart
                    );

                    // Passing a path as sink lets Guzzle open and close the file
                    // itself, so there is no handle to leak or double-close.
                    $client->send($downloadRequest, [
                        'sink' => $savePath,
                    ]);

                    $this->log_messages .= 'Файл ' . $imageName . ' скачан' . PHP_EOL;
                }
            }

            $this->pages_converted = $page + 1;
            echo 'Страниц конвертировано: ' . ($page + 1) . '/' . $this->pages_total . PHP_EOL;
            $this->save();
        }
    }

    /**
     * Extracts text from every not-yet-processed page image and appends it to
     * `content`, persisting `pages_processed` after each page.
     *
     * The status only advances to STATUS_IMAGES_CONVERTED_TO_TEXT once all
     * pages were processed; on a failed page the status is left untouched so
     * the next run resumes from that page.
     *
     * @param string $uploadPath Directory that contains the page images.
     */
    private function extractTextFromImages(string $uploadPath): void
    {
        $status = (int) $this->status;
        if ($status !== self::STATUS_IMAGES_DOWNLOADED && $status !== self::STATUS_IMAGES_CONVERTED_TO_TEXT) {
            return;
        }

        $pagesTotal = (int) $this->pages_total;
        $imagePrefix = $uploadPath . '/' . md5($this->file_name) . '-page-';

        for ($page = (int) $this->pages_processed; $page < $pagesTotal; $page++) {
            $content = $this->extractContent($imagePrefix . $page . '.png', [], $page);

            if (!$content) {
                echo "Something went wrong while processing this page.\n";

                return;
            }

            $this->pages_processed = $page + 1;
            $this->content .= $content;
            $this->save();
        }

        $this->status = self::STATUS_IMAGES_CONVERTED_TO_TEXT;
        $this->save();
    }

    /**
     * Extracts the visible text of one page image through callGptProxiImage(),
     * retrying up to MAX_EXTRACT_ATTEMPTS times.
     *
     * NOTE(review): the prompt asks the model to answer "unable to extract"
     * when it cannot read the image, but such an answer is accepted as page
     * text (as before). Confirm whether it should count as a failed attempt.
     * NOTE(review): env() is called directly; presumably this returns null
     * when the Laravel config is cached - confirm and move to config() if so.
     *
     * @param string $imagePath     Absolute path of the page image.
     * @param array  $multipartData Unused; kept for backward compatibility.
     * @param int    $pageNumber    Zero-based page index (used for log output).
     *
     * @return string Page text prefixed with a space and terminated by
     *                "\n*NEWPAGE*".
     *
     * @throws \Exception When the image cannot be read or no attempt yields text.
     */
    public function extractContent($imagePath, $multipartData, $pageNumber)
    {
        $failureMessage = 'Не могу обработать картинку ' . $imagePath . PHP_EOL;

        // Read and encode the image once instead of on every retry.
        $imageData = (is_file($imagePath) && is_readable($imagePath))
            ? file_get_contents($imagePath)
            : false;
        if ($imageData === false) {
            $this->log_messages .= $failureMessage;
            $this->save();
            throw new \Exception($failureMessage);
        }

        $request = [
            'image' => base64_encode($imageData),
            'key' => env('OPEN_AI_KEY'),
            'prompt' => 'Extract all visible text from image. If its not possible - return "unable to extract".',
            'model' => 'gpt-4o-mini',
        ];

        for ($attempt = 1; $attempt <= self::MAX_EXTRACT_ATTEMPTS; $attempt++) {
            if ($attempt > 1) {
                $this->logProgress("Повторная обработка картинки, попытка: {$attempt}" . PHP_EOL);
            }

            $response = null;
            try {
                $response = callGptProxiImage($request);
            } catch (\Exception $e) {
                echo $e->getMessage(); // Catch timeout and errors
            }

            $text = '';
            if (is_object($response) && property_exists($response, 'choices')) {
                // `??` also covers an empty choices list or a missing message.
                $text = $response->choices[0]->message->content ?? '';
            }

            if (is_string($text) && $text !== '') {
                $displayPage = $pageNumber + 1;
                $this->logProgress("Страница {$displayPage}/{$this->pages_total} обработана" . PHP_EOL);

                return ' ' . $text . "\n*NEWPAGE*";
            }
        }

        $this->log_messages .= $failureMessage;
        $this->save();
        throw new \Exception($failureMessage);
    }

    /**
     * Splits the accumulated `content` into sections and stores every section
     * that contains a MEL id as a Mel record.
     *
     * The text is processed line by line. A line that starts like a MEL id
     * opens a new section; lines before the first such line form a header that
     * is prepended to sections. "*NEWPAGE*" markers advance the page counter
     * so each section knows which pages/images it spans.
     */
    public function splitContentToMelItems()
    {
        $parts = preg_split('/\n/', (string) $this->content, -1, PREG_SPLIT_NO_EMPTY);

        $imagePrefix = '/uploads/pdfImages/' . md5($this->file_name) . '-page-';
        // NOTE(review): `\d*` also matches lines that merely start with "-"
        // (e.g. markdown bullets) - confirm whether `\d+` was intended.
        $sectionStartPattern = '/^\d*-\d*(-\d*)*/';
        $header = '';
        $currentKey = 0;
        $pageKey = 0;
        $skippedParts = ['```', '---', '```yaml', '```json', '```txt', '```python', '```bash', '```sh', '```javascript', '```html', '```css'];
        $previousLine = '';
        $finalExtractedTexts = [];

        // Iterating by reference is required: a line may be merged into the
        // NEXT element of $parts, and the loop must see that change.
        foreach ($parts as $partKey => &$part) {
            $part = str_replace('**', '', $part); // Remove markdown formatting
            $part = trim($part);
            if (in_array($part, $skippedParts, true)) {
                continue;
            }
            if (preg_match($sectionStartPattern, $part)) {
                $this->logProgress("Разделяем на строке " . $part . PHP_EOL);

                if (!empty($parts[$partKey + 1]) && preg_match($sectionStartPattern, $parts[$partKey + 1])) {
                    $parts[$partKey + 1] = $part . " " . $parts[$partKey + 1]; // Merge with next part
                    $this->logProgress("Объединяем с предыдущей строкой " . $parts[$partKey + 1] . PHP_EOL);
                    continue;
                }

                if (!empty($finalExtractedTexts[$currentKey]) && substr_count($finalExtractedTexts[$currentKey]['text'], PHP_EOL) <= 1) {
                    // A one-line section is carried over into the next section.
                    $previousLine = $finalExtractedTexts[$currentKey]['text'];
                    unset($finalExtractedTexts[$currentKey]);
                } else {
                    $previousLine = '';
                    if ($currentKey == 0) {
                        // Section 0 only holds the header, which is kept in $header.
                        unset($finalExtractedTexts[$currentKey]);
                    } else {
                        // Close the finished section by prepending the header.
                        $finalExtractedTexts[$currentKey]['text'] = "{$header}{$finalExtractedTexts[$currentKey]['text']}";
                    }
                }

                $currentKey++;
            } elseif ($currentKey == 0) {
                $header .= "{$part}\n";
            }

            if (!isset($finalExtractedTexts[$currentKey])) {
                $finalExtractedTexts[$currentKey] = [
                    'text' => $previousLine,
                    'pages' => [$pageKey],
                    'images' => [$imagePrefix . $pageKey . '.png'],
                ];
                $previousLine = '';
            }

            // Add last line from current page
            if ((substr_count($part, '*NEWPAGE*')) && $part !== '*NEWPAGE*') { // Not last line
                $lastPageLine = $parts[$partKey - 1] ?? '';
                // Append the line to every section that spans the current page.
                array_walk($finalExtractedTexts, function (&$item) use ($pageKey, $lastPageLine) {
                    if (isset($item['pages']) && in_array($pageKey, $item['pages'], true)) {
                        $item['text'] .= $lastPageLine . PHP_EOL;
                    }
                });
                $this->logProgress("Добавляем строку {$lastPageLine} к странице {$pageKey} <br/>");

                // Add last line to last page if there are two or more items
                for ($pageCounter = 1; $pageCounter <= 5; $pageCounter++) {
                    $itemKey = $currentKey - $pageCounter;
                    if (!empty($finalExtractedTexts[$itemKey]) && in_array($pageKey, $finalExtractedTexts[$itemKey]['pages'], true)) {
                        $pages = json_encode($finalExtractedTexts[$itemKey]['pages']);
                        $this->logProgress("Добавляем строку {$lastPageLine} к странице {$pageKey}, MEL Item {$itemKey}, Страницы: $pages <br/>");
                        $finalExtractedTexts[$itemKey]['text'] .= $lastPageLine . PHP_EOL;
                    }
                }

                $pageKey++;
                $finalExtractedTexts[$currentKey]['pages'][] = $pageKey;
                $finalExtractedTexts[$currentKey]['images'][] = $imagePrefix . $pageKey . '.png';
            }

            $finalExtractedTexts[$currentKey]['text'] .= $part . PHP_EOL;
        }
        unset($part); // Drop the dangling reference left by the by-reference loop

        print_r($finalExtractedTexts);

        // The last section is never closed inside the loop, so prepend the
        // header here (nothing to do when no line produced a section).
        if (isset($finalExtractedTexts[$currentKey])) {
            $finalExtractedTexts[$currentKey]['text'] = "{$header}{$finalExtractedTexts[$currentKey]['text']}";
        }

        $melIdPattern = '/\b\d{2}(?:-\d{2}){1,}\b/m';

        // Convert extracted texts to Mel records
        foreach ($finalExtractedTexts as $key => $extractedText) {
            if (!preg_match($melIdPattern, $extractedText['text'])) {
                continue; // Skip parts without MEL ID
            }

            $imageNames = $extractedText['images'];
            $imageNames[] = '/' . $this->file_path;

            $mel = new Mel();
            $mel->file_name = json_encode($imageNames);
            $mel->content = $extractedText['text'];
            $mel->response = '';
            $mel->chatgpt_response = '';
            $mel->page_number = $key + 1;
            $mel->save();

            echo "Mel item #{$mel->id} created<br/>";
        }
    }

    /**
     * Echoes a progress message, appends it to `log_messages` and persists
     * the model.
     *
     * @param string $message Message including its own line terminator.
     */
    private function logProgress(string $message): void
    {
        echo $message;
        $this->log_messages .= $message;
        $this->save();
    }

}