first commit

This commit is contained in:
team 1
2026-02-11 14:15:08 +01:00
parent a4742c2c38
commit aa7d362bc3
58 changed files with 9999 additions and 0 deletions

120
src/Context/UrlAnalyzer.php Normal file
View File

@@ -0,0 +1,120 @@
<?php
declare(strict_types=1);
namespace App\Context;
use RuntimeException;
/**
 * UrlAnalyzer
 *
 * Extracts and analyzes URL content from user prompts.
 *
 * Responsibilities:
 * - Detect the first URL inside a prompt
 * - Fetch remote content with strict size and time limits
 * - Clean and normalize readable text
 * - Identify trusted internal domains based on URL host
 *
 * Design constraints:
 * - No framework dependencies
 * - No prompt or agent logic
 * - Defensive against slow or large responses
 *
 * NOTE(review): the fetched URL comes from untrusted prompt text. Nothing in
 * this class prevents requests to loopback or private-network hosts, and the
 * HTTP stream wrapper follows redirects unless configured otherwise. Confirm
 * that this is acceptable for the deployment, or add a host allow-list.
 */
final class UrlAnalyzer
{
    /**
     * Timeout in seconds, used both as the per-read stream timeout and as the
     * overall time budget for downloading one response body.
     */
    private int $timeoutSeconds = 20;

    /**
     * Maximum number of characters of cleaned text returned to the caller.
     * Twice this value is used as the byte cap for the raw download.
     */
    private int $maxChars = 5000;

    /**
     * List of trusted internal domains.
     * Used for marking content as authoritative.
     *
     * @var list<string>
     */
    private array $internalDomains = [
        'mitho-media.de',
    ];

    /**
     * Extracts readable text from the first URL found in a prompt.
     *
     * Both "http(s)://..." and scheme-less "www...." URLs are recognised; the
     * latter are fetched over HTTPS. Sentence punctuation directly following
     * the URL is not treated as part of it.
     *
     * @param string $prompt Free text that may contain a URL
     * @return string Cleaned page text or empty string on failure
     */
    public function extractContentFromPrompt(string $prompt): string
    {
        if (preg_match('~https?://\S+|www\.\S+~i', $prompt, $matches) !== 1) {
            return '';
        }

        $url = $this->normalizeUrl($this->stripTrailingPunctuation($matches[0]));

        $parts = parse_url($url);
        if ($parts === false || ($parts['host'] ?? '') === '') {
            return '';
        }

        $html = $this->fetchLimited($url);
        if ($html === '') {
            return '';
        }

        return $this->htmlToText($html);
    }

    /**
     * Determines whether a URL belongs to a trusted internal domain.
     *
     * The host must equal a trusted domain or be a subdomain of it. The
     * comparison ignores case and a trailing root dot, and accepts the same
     * scheme-less "www." form that extractContentFromPrompt() accepts.
     *
     * @param string $url
     * @return bool
     */
    public function isInternalDomainUrl(string $url): bool
    {
        $parts = parse_url($this->normalizeUrl($url));
        if ($parts === false || ($parts['host'] ?? '') === '') {
            return false;
        }

        // "example.de." is the fully-qualified spelling of "example.de".
        $host = rtrim(mb_strtolower($parts['host']), '.');

        foreach ($this->internalDomains as $domain) {
            if ($host === $domain || str_ends_with($host, '.' . $domain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Brings a URL into the form the other methods expect: surrounding
     * whitespace removed, http/https scheme lower-cased, and "https://"
     * prepended to scheme-less "www." URLs. Anything else is returned as is.
     */
    private function normalizeUrl(string $url): string
    {
        $url = trim($url);

        if (preg_match('~^(https?)(://.*)$~is', $url, $m) === 1) {
            return strtolower($m[1]) . $m[2];
        }

        if (strncasecmp($url, 'www.', 4) === 0) {
            return 'https://' . $url;
        }

        return $url;
    }

    /**
     * Removes punctuation that belongs to the surrounding sentence rather
     * than to the URL, e.g. the final "." in "see https://example.com/a.".
     * Closing brackets are only removed when they have no opening partner
     * inside the URL, so "https://host/wiki/Foo_(bar)" stays intact.
     */
    private function stripTrailingPunctuation(string $url): string
    {
        $openerFor = [')' => '(', ']' => '[', '}' => '{'];

        // Byte-wise access is safe here: every character tested is ASCII and
        // ASCII bytes never occur inside a multi-byte UTF-8 sequence.
        while ($url !== '') {
            $last = $url[strlen($url) - 1];

            $isSentencePunctuation = str_contains('.,;:!?"\'>', $last);
            $isUnbalancedCloser = isset($openerFor[$last])
                && substr_count($url, $last) > substr_count($url, $openerFor[$last]);

            if (!$isSentencePunctuation && !$isUnbalancedCloser) {
                break;
            }

            $url = substr($url, 0, -1);
        }

        return $url;
    }

    /**
     * Downloads at most maxChars * 2 bytes from the URL.
     *
     * The loop stops on end of stream, on a read error, on a stream timeout,
     * or once the overall time budget is used up. A stalled or slow-drip
     * server therefore cannot keep the loop running indefinitely.
     *
     * @return string Raw response body (possibly truncated), or '' on failure
     */
    private function fetchLimited(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => $this->timeoutSeconds,
                'user_agent' => 'mithoAgent/1.0',
                // Read the body even for 4xx/5xx responses.
                'ignore_errors' => true,
            ],
        ]);

        // Best effort: an unreachable URL is reported as "no content".
        $handle = @fopen($url, 'rb', false, $context);
        if ($handle === false) {
            return '';
        }

        $byteLimit = $this->maxChars * 2;
        $deadline = microtime(true) + $this->timeoutSeconds;
        $html = '';

        try {
            while (strlen($html) < $byteLimit && !feof($handle) && microtime(true) < $deadline) {
                $chunk = fread($handle, min(1024, $byteLimit - strlen($html)));
                if ($chunk === false) {
                    break;
                }

                // feof() stays false after a read timeout, so check explicitly.
                if ($chunk === '' && (stream_get_meta_data($handle)['timed_out'] ?? false) === true) {
                    break;
                }

                $html .= $chunk;
            }
        } finally {
            fclose($handle);
        }

        return $html;
    }

    /**
     * Converts a (possibly truncated) HTML fragment into plain text limited
     * to maxChars characters. HTML entities are left as they are.
     */
    private function htmlToText(string $html): string
    {
        // The byte cap can cut a multi-byte character in half; invalid UTF-8
        // would make the /u pattern below fail and skip the normalisation.
        $html = mb_scrub($html, 'UTF-8');

        // Remove script and style blocks, including a block left unterminated
        // by the byte cap (matched up to the end of the input).
        $html = preg_replace('~<(script|style)\b[^>]*>.*?(?:</\1\s*>|\z)~is', '', $html) ?? $html;

        // Strip remaining HTML and normalize whitespace
        $text = strip_tags($html);
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return mb_substr(trim($text), 0, $this->maxChars, 'UTF-8');
    }
}