Optimize URL detection
This commit is contained in:
@@ -4,7 +4,9 @@ declare(strict_types=1);
|
||||
|
||||
namespace App\Context;
|
||||
|
||||
use RuntimeException;
|
||||
use fivefilters\Readability\Configuration;
|
||||
use fivefilters\Readability\ParseException;
|
||||
use fivefilters\Readability\Readability;
|
||||
|
||||
/**
|
||||
* UrlAnalyzer
|
||||
@@ -65,30 +67,29 @@ final class UrlAnalyzer
|
||||
],
|
||||
]);
|
||||
|
||||
$handle = @fopen($url, 'rb', false, $context);
|
||||
if ($handle === false) {
|
||||
$html = @file_get_contents($url, false, $context);
|
||||
if ($html === false || $html === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
$config = new Configuration();
|
||||
$config->setFixRelativeURLs(true);
|
||||
$config->setOriginalURL($url);
|
||||
|
||||
$readability = new Readability($config);
|
||||
|
||||
try {
|
||||
$html = '';
|
||||
while (!feof($handle) && strlen($html) < $this->maxChars * 2) {
|
||||
$html .= fread($handle, 1024);
|
||||
}
|
||||
} finally {
|
||||
fclose($handle);
|
||||
}
|
||||
|
||||
if ($html === '') {
|
||||
$readability->parse($html);
|
||||
$content = $readability->getContent() ?? '';
|
||||
} catch (ParseException) {
|
||||
return '';
|
||||
}
|
||||
|
||||
// Remove script and style blocks
|
||||
$html = preg_replace('~<script[^>]*>.*?</script>~is', '', $html) ?? $html;
|
||||
$html = preg_replace('~<style[^>]*>.*?</style>~is', '', $html) ?? $html;
|
||||
if ($content === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// Strip remaining HTML and normalize whitespace
|
||||
$text = strip_tags($html);
|
||||
$text = strip_tags($content);
|
||||
$text = preg_replace('/\s+/u', ' ', $text) ?? $text;
|
||||
|
||||
return mb_substr(trim($text), 0, $this->maxChars);
|
||||
|
||||
Reference in New Issue
Block a user