File: /var/www/html/laravel/app/Models/MelPdfFile.php
<?php
namespace App\Models;
use DateTime;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\MultipartStream;
use Illuminate\Database\Eloquent\Model;
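/**
* Eloquent model for an uploaded PDF that is converted page by page into
* images, then into text, and finally into Mel records.
*
* @property int $id
* @property string|null $file_name
* @property string|null $file_path
* @property int $pages_total
* @property int $pages_converted
* @property int $pages_processed
* @property int $status
* @property string|null $extract_data_with
* @property string|null $content
* @property string|null $log_messages
*/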
class MelPdfFile extends Model
{
// Values of the `status` attribute, listed in the order process() moves through them.
// Not started yet; process() switches it to STATUS_PROCESSING on its first run.
public const STATUS_PENDING = 0;
// PDF pages are being converted to images and the images downloaded.
public const STATUS_PROCESSING = 1;
// Set by process() after the page-to-image conversion loop.
public const STATUS_IMAGES_DOWNLOADED = 2;
// Set by process() after the image-to-text extraction loop.
public const STATUS_IMAGES_CONVERTED_TO_TEXT = 3;
// Set by process() after splitContentToMelItems() has created the Mel records.
public const STATUS_MEL_OBJECTS_CREATED = 4;
// Database table backing this model.
protected $table = 'mel_pdf_files';
// Attributes that may be mass-assigned.
protected $fillable = [
'file_name',
'file_path',
'pages_total',
// Number of pages already converted to images; process() resumes from here.
'pages_converted',
// Number of pages whose text has been extracted; process() resumes from here.
'pages_processed',
'status',
'extract_data_with',
// Accumulated text of all processed pages, later split by splitContentToMelItems().
'content',
// Progress log appended to while processing.
'log_messages',
];
/**
 * Run (or resume) the processing pipeline for this PDF.
 *
 * Stages:
 *  1. STATUS_PENDING is switched to STATUS_PROCESSING; every remaining page
 *     is posted to the n8n `pdfToImages` webhook and the resulting images are
 *     downloaded through the `readFile` webhook into the pdfImages folder.
 *  2. STATUS_IMAGES_DOWNLOADED: every remaining page image is handed to
 *     extractContent() and its text is appended to `content`.
 *  3. STATUS_IMAGES_CONVERTED_TO_TEXT: once all pages are processed the text
 *     is split into Mel records and the status becomes
 *     STATUS_MEL_OBJECTS_CREATED.
 *
 * Counters are saved after every page, so an interrupted run continues where
 * it stopped. A non-pending file that was updated less than five minutes ago
 * is skipped (it is assumed to be worked on by another run).
 *
 * Exceptions are caught and echoed, not rethrown.
 *
 * @return string|null Empty string when the pipeline ran (or the file was
 *                     already finished); null when the run was skipped or failed.
 */
public function process()
{
    try {
        // A finished file must not go through the pipeline again: the stages
        // below would reset the status and create duplicate Mel records.
        if ((int) $this->status === self::STATUS_MEL_OBJECTS_CREATED) {
            return '';
        }

        // Throttle on the real elapsed time. Comparing only the hour/minute
        // components of a date diff ignores whole days, so a file touched
        // "1 day and 2 minutes" ago would wrongly look recently updated.
        $updatedAt = $this->updated_at;
        $lastUpdate = $updatedAt instanceof \DateTimeInterface
            ? $updatedAt->getTimestamp()
            : strtotime((string) $updatedAt);
        $recentlyUpdated = $lastUpdate !== false && abs(time() - $lastUpdate) < 5 * 60;

        // Cast before the strict comparison: the driver may return "0" as a string.
        if ($recentlyUpdated && (int) $this->status !== self::STATUS_PENDING) {
            return;
        }

        if ($this->status == self::STATUS_PENDING) {
            $this->status = self::STATUS_PROCESSING;
            $this->save();
        }

        $uploadPath = storage_path('app/public/uploads/pdfImages');
        $client = new Client();
        $multipartData = [];
        $response = '';

        // SplFileObject throws if the stored PDF cannot be opened.
        $inputFile = new \SplFileObject(storage_path('app/public/' . $this->file_path));
        $pdfPath = $inputFile->getPathname();
        $pdfMimeType = mime_content_type($pdfPath);
        $remoteName = md5($this->file_name);

        // Stage 1: convert the PDF to images, one page per request, and download them.
        $currentPage = $this->pages_converted;
        while ($currentPage < $this->pages_total && $this->status == self::STATUS_PROCESSING) {
            $pdfMultipart = new MultipartStream([
                [
                    'name' => 'file',
                    'contents' => fopen($pdfPath, 'r'),
                    'filename' => basename($pdfPath),
                    'headers' => [
                        'Content-Type' => $pdfMimeType,
                    ],
                ],
                [
                    'name' => 'fileName',
                    'contents' => $remoteName,
                ],
                [
                    'name' => 'page',
                    'contents' => $currentPage, // Zero-based page index
                ],
            ]);
            $headers = [
                'Content-Type' => 'multipart/form-data; boundary=' . $pdfMultipart->getBoundary(),
            ];

            $n8nCall = new N8n('POST', '/webhook/pdfToImages', $headers, $pdfMultipart);
            $imagesResponse = json_decode((string) $client->send($n8nCall)->getBody());

            if (is_array($imagesResponse)) {
                foreach ($imagesResponse as $image) {
                    // property_exists() raises a TypeError for non-objects, so check first.
                    if (!is_object($image) || !property_exists($image, 'fileName')) {
                        continue;
                    }

                    // The file name comes from a remote response: escape it in the
                    // query string and strip directory parts before writing locally.
                    // NOTE(review): the GET request reuses the multipart headers/body
                    // of the POST above, as the original code did — confirm N8n needs them.
                    $n8nCall = new N8n(
                        'GET',
                        '/webhook/readFile?file=' . rawurlencode($image->fileName),
                        $headers,
                        $pdfMultipart
                    );
                    $savePath = $uploadPath . '/' . basename($image->fileName);

                    $fileHandle = fopen($savePath, 'w');
                    if ($fileHandle === false) {
                        throw new \RuntimeException('Unable to open ' . $savePath . ' for writing');
                    }
                    try {
                        $client->send($n8nCall, [
                            'sink' => $fileHandle, // Stream response to file
                        ]);
                    } finally {
                        // Guzzle may already have closed the sink when the response
                        // body was released; is_resource() is false for a closed
                        // handle, which avoids the "not a valid stream resource" error.
                        if (is_resource($fileHandle)) {
                            fclose($fileHandle);
                        }
                    }

                    $this->log_messages .= 'Файл ' . basename($savePath) . ' скачан' . PHP_EOL;
                }
            }

            $this->pages_converted++;
            $currentPage++;
            echo 'Страниц конвертировано: ' . $currentPage . '/' . $this->pages_total . PHP_EOL;
            $this->save();
        }

        $this->status = self::STATUS_IMAGES_DOWNLOADED;
        $this->save();

        // Stage 2: extract text from every page image not processed yet.
        $currentPage = $this->pages_processed;
        while ($currentPage < $this->pages_total && $this->status == self::STATUS_IMAGES_DOWNLOADED) {
            $imagePath = $uploadPath . '/' . $remoteName . '-page-' . $currentPage . '.png';
            $content = $this->extractContent($imagePath, $multipartData, $currentPage);
            if (!$content) {
                echo "Something went wrong while processing this page.\n";
                break;
            }

            $this->pages_processed = $currentPage + 1;
            $this->content .= $content;
            $this->save();
            $currentPage++;
        }

        $this->status = self::STATUS_IMAGES_CONVERTED_TO_TEXT;
        $this->save();

        // Stage 3: split the text into Mel records once every page is done.
        // Compare as integers: one side may be a driver-returned string while
        // the other is an int assigned above.
        if ((int) $this->pages_processed === (int) $this->pages_total) {
            $this->splitContentToMelItems();
            $this->status = self::STATUS_MEL_OBJECTS_CREATED;
            $this->save();
        }

        return $response;
    } catch (\Exception $e) {
        // Report the failure; the saved counters let the next run resume.
        echo $e->getMessage() . $e->getTraceAsString();
    }
}
/**
 * Extract the visible text of one page image through the GPT proxy helper.
 *
 * Up to three attempts are made. An attempt counts as failed when the helper
 * throws, or when its reply has no non-empty `choices[0]->message->content`.
 * Progress and retry messages are echoed and appended to `log_messages`.
 *
 * NOTE(review): a reply containing "unable to extract" is accepted as page
 * text, exactly as before — the old loop condition that looked for it was
 * never reached. Decide separately whether such pages should be retried.
 *
 * @param string $imagePath     Absolute path of the page image.
 * @param array  $multipartData Unused; kept so existing callers keep working.
 * @param int    $pageNumber    Zero-based page index (shown one-based in messages).
 *
 * @return string The page text, prefixed with a space and followed by "\n*NEWPAGE*".
 *
 * @throws \RuntimeException When the image file cannot be read.
 * @throws \Exception        When no text was obtained after three attempts.
 */
public function extractContent($imagePath, $multipartData, $pageNumber)
{
    // Fail early with a clear message instead of sending an empty image.
    $imageContents = is_file($imagePath) && is_readable($imagePath)
        ? file_get_contents($imagePath)
        : false;
    if ($imageContents === false) {
        throw new \RuntimeException('Cannot read page image ' . $imagePath);
    }

    // The request is identical for every attempt, so build it once.
    // NOTE(review): env() returns null outside config files when the Laravel
    // config is cached — consider moving the key to a config entry.
    $request = [
        'image' => base64_encode($imageContents),
        'key' => env('OPEN_AI_KEY'),
        'prompt' => 'Extract all visible text from image. If its not possible - return "unable to extract".',
        'model' => 'gpt-4o-mini',
    ];

    $maxAttempts = 3;
    $extractedText = '';

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $reply = null;
        try {
            $reply = callGptProxiImage($request);
        } catch (\Exception $e) {
            echo $e->getMessage(); // Catch timeout and errors
        }

        // isset() also covers a missing or empty `choices` list.
        $candidate = is_object($reply) && isset($reply->choices[0]->message->content)
            ? $reply->choices[0]->message->content
            : '';

        if (!empty($candidate)) {
            $pageNumber = $pageNumber + 1;
            echo $message = "Страница {$pageNumber}/{$this->pages_total} обработана" . PHP_EOL;
            $this->log_messages .= $message;
            $this->save();
            $extractedText = $candidate;
            break;
        }

        if ($attempt >= $maxAttempts) {
            $message = 'Не могу обработать картинку ' . $imagePath . PHP_EOL;
            $this->log_messages .= $message;
            $this->save();
            throw new \Exception($message);
        }

        // The first failure is retried silently; later ones are reported.
        if ($attempt !== 1) {
            echo $message = "Повторная обработка картинки, попытка: {$attempt}" . PHP_EOL;
            $this->log_messages .= $message;
            $this->save();
        }
    }

    return ' ' . $extractedText . "\n*NEWPAGE*";
}
/**
 * Split the accumulated page text into sections and store each one as a Mel record.
 *
 * The text in `content` is processed line by line:
 *  - lines before the first section start are collected as a header that is
 *    prepended to the sections;
 *  - a line matching the section-start pattern (digits and dashes at the
 *    start of the line) opens a new section;
 *  - a line containing the `*NEWPAGE*` marker moves on to the next page and
 *    records that page and its image on the current section.
 *
 * Sections without a MEL id (two-digit groups joined by dashes) are dropped;
 * every remaining section is saved as a Mel record. Progress messages are
 * echoed and appended to `log_messages`.
 *
 * @return void
 */
public function splitContentToMelItems()
{
    // Guard against null content and a failed split (preg_split returns false).
    $parts = preg_split('/\n/', (string) $this->content, -1, PREG_SPLIT_NO_EMPTY) ?: [];

    $sectionStartPattern = '/^\d*-\d*(-\d*)*/';
    // md5() output is hexadecimal, so the image name needs no further escaping.
    $imagePrefix = '/uploads/pdfImages/' . md5($this->file_name) . '-page-';
    $skippedParts = ['```', '---', '```yaml', '```json', '```txt', '```python', '```bash', '```sh', '```javascript', '```html', '```css'];

    $header = '';
    $currentKey = 0;
    $pageKey = 0;
    $previousLine = '';
    $finalExtractedTexts = [];

    // Iterate by reference on purpose: a section-start line may be merged into
    // $parts[$partKey + 1], and that change must be visible on the next pass.
    foreach ($parts as $partKey => &$part) {
        $part = trim(str_replace('**', '', $part)); // Remove markdown formatting
        if (in_array($part, $skippedParts, true)) {
            continue;
        }

        if (preg_match($sectionStartPattern, $part)) {
            echo $message = "Разделяем на строке " . $part . PHP_EOL;
            $this->log_messages .= $message;
            $this->save();

            if (!empty($parts[$partKey + 1]) && preg_match($sectionStartPattern, $parts[$partKey + 1])) {
                $parts[$partKey + 1] = $part . " " . $parts[$partKey + 1]; // Merge with next part
                echo $message = "Объединяем с предыдущей строкой " . $parts[$partKey + 1] . PHP_EOL;
                $this->log_messages .= $message;
                $this->save();
                continue;
            }

            if (!empty($finalExtractedTexts[$currentKey]) && substr_count($finalExtractedTexts[$currentKey]['text'], PHP_EOL) <= 1) {
                // A one-line section is carried over into the next section.
                $previousLine = $finalExtractedTexts[$currentKey]['text'];
                unset($finalExtractedTexts[$currentKey]);
            } else {
                $previousLine = '';
                if ($currentKey == 0) {
                    // Everything before the first section is the header.
                    unset($finalExtractedTexts[$currentKey]);
                } else {
                    // The section being closed gets the header prepended.
                    $finalExtractedTexts[$currentKey]['text'] = "{$header}{$finalExtractedTexts[$currentKey]['text']}";
                }
            }
            $currentKey++;
        } elseif ($currentKey == 0) {
            $header .= "{$part}\n";
        }

        if (!isset($finalExtractedTexts[$currentKey])) {
            $finalExtractedTexts[$currentKey] = [
                'text' => $previousLine,
                'pages' => [$pageKey],
                'images' => [$imagePrefix . $pageKey . '.png'],
            ];
            $previousLine = '';
        }

        // Page boundary: the marker is followed by the first line of the next page.
        if ((substr_count($part, '*NEWPAGE*')) && $part !== '*NEWPAGE*') { // Not last line
            // There is no previous part when the marker is on the very first line.
            $lastPageLine = $parts[$partKey - 1] ?? '';

            // Append the last line of the page to every section that spans it.
            array_walk($finalExtractedTexts, function (&$item) use ($pageKey, $lastPageLine) {
                if (isset($item['pages']) && in_array($pageKey, $item['pages'])) {
                    $item['text'] .= $lastPageLine . PHP_EOL;
                }
            });
            echo $message = "Добавляем строку {$lastPageLine} к странице {$pageKey} <br/>";
            $this->log_messages .= $message;
            $this->save();

            // Add last line to last page if there are two or more items
            for ($pageCounter = 1; $pageCounter <= 5; $pageCounter++) {
                $itemKey = $currentKey - $pageCounter;
                $pages = json_encode($finalExtractedTexts[$itemKey]['pages'] ?? []);
                if (!empty($finalExtractedTexts[$itemKey]) && in_array($pageKey, $finalExtractedTexts[$itemKey]['pages'])) {
                    echo $message = "Добавляем строку {$lastPageLine} к странице {$pageKey}, MEL Item {$itemKey}, Страницы: $pages <br/>";
                    $this->log_messages .= $message;
                    $this->save();
                    $finalExtractedTexts[$itemKey]['text'] .= $lastPageLine . PHP_EOL;
                }
            }

            $pageKey++;
            $finalExtractedTexts[$currentKey]['pages'][] = $pageKey;
            $finalExtractedTexts[$currentKey]['images'][] = $imagePrefix . $pageKey . '.png';
        }

        $finalExtractedTexts[$currentKey]['text'] .= $part . PHP_EOL;
    }
    // Break the reference so later code cannot overwrite the last element.
    unset($part);

    print_r($finalExtractedTexts);

    // The last section is still open; prepend the header to it. With empty
    // content there is no section at all, so there is nothing to prepend.
    if (isset($finalExtractedTexts[$currentKey])) {
        $finalExtractedTexts[$currentKey]['text'] = "{$header}{$finalExtractedTexts[$currentKey]['text']}";
    }

    $pattern = '/\b\d{2}(?:-\d{2}){1,}\b/m';
    foreach ($finalExtractedTexts as $key => $extractedText) {
        if (!preg_match($pattern, $extractedText['text'])) {
            unset($finalExtractedTexts[$key]); // Remove parts without MEL ID
        }
    }

    // Store every remaining section as a Mel record.
    foreach ($finalExtractedTexts as $key => $extractedText) {
        $imageNames = $extractedText['images'];
        $imageNames[] = '/' . $this->file_path;

        $mel = new Mel();
        $mel->file_name = json_encode($imageNames);
        $mel->content = $extractedText['text'];
        $mel->response = '';
        $mel->chatgpt_response = '';
        $mel->page_number = $key + 1;
        $mel->save();
        echo "Mel item #{$mel->id} created<br/>";
    }
}
}