Optimize URL detection

This commit is contained in:
team 1
2026-04-14 18:56:20 +02:00
parent fe6eb25d25
commit b452607e2c
4 changed files with 498 additions and 65 deletions

View File

@@ -4,7 +4,9 @@ declare(strict_types=1);
namespace App\Context;
use RuntimeException;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\ParseException;
use fivefilters\Readability\Readability;
/**
* UrlAnalyzer
@@ -65,30 +67,29 @@ final class UrlAnalyzer
],
]);
$handle = @fopen($url, 'rb', false, $context);
if ($handle === false) {
$html = @file_get_contents($url, false, $context);
if ($html === false || $html === '') {
return '';
}
$config = new Configuration();
$config->setFixRelativeURLs(true);
$config->setOriginalURL($url);
$readability = new Readability($config);
try {
$html = '';
while (!feof($handle) && strlen($html) < $this->maxChars * 2) {
$html .= fread($handle, 1024);
}
} finally {
fclose($handle);
}
if ($html === '') {
$readability->parse($html);
$content = $readability->getContent() ?? '';
} catch (ParseException) {
return '';
}
// Remove script and style blocks
$html = preg_replace('~<script[^>]*>.*?</script>~is', '', $html) ?? $html;
$html = preg_replace('~<style[^>]*>.*?</style>~is', '', $html) ?? $html;
if ($content === '') {
return '';
}
// Strip remaining HTML and normalize whitespace
$text = strip_tags($html);
$text = strip_tags($content);
$text = preg_replace('/\s+/u', ' ', $text) ?? $text;
return mb_substr(trim($text), 0, $this->maxChars);