122 lines
3.0 KiB
PHP
122 lines
3.0 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Context;
|
|
|
|
use fivefilters\Readability\Configuration;
|
|
use fivefilters\Readability\ParseException;
|
|
use fivefilters\Readability\Readability;
|
|
|
|
/**
 * UrlAnalyzer
 *
 * Extracts and analyzes URL content from user prompts in a production-safe way.
 *
 * Responsibilities:
 * - Detect the first URL inside a prompt
 * - Fetch remote content with strict limits (timeout and maximum body size)
 * - Clean and normalize readable text
 * - Identify trusted internal domains based on URL host
 *
 * Design constraints:
 * - No framework dependencies
 * - No prompt or agent logic
 * - Defensive against slow or large responses
 */
final class UrlAnalyzer
{
    /** Timeout in seconds handed to the HTTP stream context. */
    private int $timeoutSeconds = 20;

    /** Maximum number of bytes read from a remote response body. */
    private int $maxBytes = 2_000_000;

    /** Maximum number of characters of cleaned text returned to the caller. */
    private int $maxChars = 5000;

    /**
     * List of trusted internal domains.
     * Used for marking content as authoritative.
     *
     * @var list<string>
     */
    private array $internalDomains = [
        'mitho-media.de',
    ];

    /**
     * Extracts readable text from the first URL found in a prompt.
     *
     * The first `http://`, `https://` or `www.` token in the prompt is fetched,
     * run through Readability, stripped of markup, whitespace-collapsed and
     * truncated to $maxChars characters.
     *
     * @param string $prompt Free-form user text that may contain a URL
     * @return string Cleaned page text or empty string on failure
     */
    public function extractContentFromPrompt(string $prompt): string
    {
        $url = $this->findFirstUrl($prompt);
        if ($url === null) {
            return '';
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return '';
        }

        $html = $this->fetchHtml($url);
        if ($html === '') {
            return '';
        }

        $config = new Configuration();
        $config->setFixRelativeURLs(true);
        $config->setOriginalURL($url);

        $readability = new Readability($config);

        try {
            $readability->parse($html);
            $content = $readability->getContent() ?? '';
        } catch (ParseException) {
            return '';
        }

        if ($content === '') {
            return '';
        }

        return $this->toPlainText($content);
    }

    /**
     * Determines whether a URL belongs to a trusted internal domain.
     *
     * A host matches when it equals a trusted domain or is a subdomain of it.
     * Scheme-less `www.` URLs are accepted, mirroring what
     * extractContentFromPrompt() is willing to fetch, and a trailing dot on a
     * fully-qualified host name is ignored.
     *
     * @param string $url Absolute URL, or a scheme-less URL starting with `www.`
     * @return bool True when the URL's host is a trusted domain or one of its subdomains
     */
    public function isInternalDomainUrl(string $url): bool
    {
        $host = parse_url($this->ensureScheme($url), PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        // "example.de." and "example.de" name the same host.
        $host = rtrim(mb_strtolower($host), '.');

        foreach ($this->internalDomains as $domain) {
            // The leading dot prevents "evil-mitho-media.de" from matching.
            if ($host === $domain || str_ends_with($host, '.' . $domain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Locates the first URL in free text and returns it in fetchable form.
     *
     * @return string|null URL including a scheme, or null when the text contains none
     */
    private function findFirstUrl(string $text): ?string
    {
        if (preg_match('~https?://\S+|www\.\S+~i', $text, $matches) !== 1) {
            return null;
        }

        return $this->ensureScheme($this->stripTrailingPunctuation($matches[0]));
    }

    /**
     * Prefixes scheme-less `www.` URLs with `https://`; anything else is returned unchanged.
     */
    private function ensureScheme(string $url): string
    {
        return strncasecmp($url, 'www.', 4) === 0 ? 'https://' . $url : $url;
    }

    /**
     * Removes sentence punctuation that `\S+` captured together with the URL.
     *
     * Closing brackets are removed only while they outnumber their opening
     * counterpart, so a URL such as ".../Foo_(bar)" keeps its final ")".
     */
    private function stripTrailingPunctuation(string $url): string
    {
        $openers = [')' => '(', ']' => '[', '}' => '{', '>' => '<'];

        while ($url !== '') {
            // Byte access is safe here: every character tested is ASCII.
            $last = $url[-1];

            $isSentencePunctuation = str_contains('.,;:!?\'"', $last);
            $isUnbalancedCloser = isset($openers[$last])
                && substr_count($url, $last) > substr_count($url, $openers[$last]);

            if (!$isSentencePunctuation && !$isUnbalancedCloser) {
                break;
            }

            $url = substr($url, 0, -1);
        }

        return $url;
    }

    /**
     * Downloads at most $maxBytes of the response body for a URL.
     *
     * NOTE(review): the URL originates from user input and is fetched as-is;
     * hosts that resolve to loopback or private addresses are not filtered here.
     *
     * @return string Response body, or empty string when the request failed
     */
    private function fetchHtml(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => $this->timeoutSeconds,
                'user_agent' => 'mithoAgent/1.0',
                // Return the body even when the server answers with a 4xx/5xx status.
                'ignore_errors' => true,
            ],
        ]);

        // Best effort: warnings are suppressed and any failure becomes an empty result.
        // The length cap keeps an oversized response from being read into memory.
        $body = @file_get_contents($url, false, $context, length: $this->maxBytes);

        return $body === false ? '' : $body;
    }

    /**
     * Converts an HTML fragment to single-spaced plain text capped at $maxChars characters.
     */
    private function toPlainText(string $html): string
    {
        $text = strip_tags($html);
        // preg_replace() returns null on invalid UTF-8; fall back to the unmodified text.
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return mb_substr(trim($text), 0, $this->maxChars);
    }
}
|