optimize url detection

This commit is contained in:
team 1
2026-04-14 18:56:20 +02:00
parent fe6eb25d25
commit b452607e2c
4 changed files with 498 additions and 65 deletions

View File

@@ -7,10 +7,13 @@
"php": "^8.2", "php": "^8.2",
"ext-ctype": "*", "ext-ctype": "*",
"ext-curl": "*", "ext-curl": "*",
"ext-dom": "*",
"ext-iconv": "*", "ext-iconv": "*",
"ext-libxml": "*",
"doctrine/doctrine-bundle": "^2.18", "doctrine/doctrine-bundle": "^2.18",
"doctrine/doctrine-migrations-bundle": "^3.7", "doctrine/doctrine-migrations-bundle": "^3.7",
"doctrine/orm": "^3.6", "doctrine/orm": "^3.6",
"fivefilters/readability.php": ">=3.0",
"league/commonmark": "^2.8", "league/commonmark": "^2.8",
"smalot/pdfparser": "^2.12", "smalot/pdfparser": "^2.12",
"symfony/asset": "7.4.*", "symfony/asset": "7.4.*",

430
composer.lock generated
View File

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "bf516574b65f7c2abdc053c964f769aa", "content-hash": "6b5cec5df97930b08d52d1e9599d125b",
"packages": [ "packages": [
{ {
"name": "dflydev/dot-access-data", "name": "dflydev/dot-access-data",
@@ -1194,6 +1194,71 @@
}, },
"time": "2026-02-08T16:21:46+00:00" "time": "2026-02-08T16:21:46+00:00"
}, },
{
"name": "fivefilters/readability.php",
"version": "v3.3.3",
"source": {
"type": "git",
"url": "https://github.com/fivefilters/readability.php.git",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/fivefilters/readability.php/zipball/e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"reference": "e2ee7b9e49eae89ac7ed2c74b15718100a73b4c8",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-mbstring": "*",
"ext-xml": "*",
"league/uri": "^7.0",
"masterminds/html5": "^2.0",
"php": ">=8.1",
"psr/log": "^1.0 || ^2.0 || ^3.0"
},
"require-dev": {
"monolog/monolog": "^3.0",
"phpunit/phpunit": "^10.0 || ^11.0"
},
"suggest": {
"monolog/monolog": "Allow logging debug information"
},
"type": "library",
"autoload": {
"psr-4": {
"fivefilters\\Readability\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Apache-2.0"
],
"authors": [
{
"name": "Andres Rey",
"email": "andreskrey@gmail.com",
"role": "Original Developer"
},
{
"name": "Keyvan Minoukadeh",
"email": "keyvan@fivefilters.org",
"homepage": "https://www.fivefilters.org",
"role": "Developer/Maintainer"
}
],
"description": "A PHP port of Readability.js",
"homepage": "https://github.com/fivefilters/readability.php",
"keywords": [
"html",
"readability"
],
"support": {
"issues": "https://github.com/fivefilters/readability.php/issues",
"source": "https://github.com/fivefilters/readability.php/tree/v3.3.3"
},
"time": "2025-04-26T23:45:37+00:00"
},
{ {
"name": "league/commonmark", "name": "league/commonmark",
"version": "2.8.0", "version": "2.8.0",
@@ -1383,6 +1448,255 @@
], ],
"time": "2022-12-11T20:36:23+00:00" "time": "2022-12-11T20:36:23+00:00"
}, },
{
"name": "league/uri",
"version": "7.8.1",
"source": {
"type": "git",
"url": "https://github.com/thephpleague/uri.git",
"reference": "08cf38e3924d4f56238125547b5720496fac8fd4"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thephpleague/uri/zipball/08cf38e3924d4f56238125547b5720496fac8fd4",
"reference": "08cf38e3924d4f56238125547b5720496fac8fd4",
"shasum": ""
},
"require": {
"league/uri-interfaces": "^7.8.1",
"php": "^8.1",
"psr/http-factory": "^1"
},
"conflict": {
"league/uri-schemes": "^1.0"
},
"suggest": {
"ext-bcmath": "to improve IPV4 host parsing",
"ext-dom": "to convert the URI into an HTML anchor tag",
"ext-fileinfo": "to create Data URI from file contennts",
"ext-gmp": "to improve IPV4 host parsing",
"ext-intl": "to handle IDN host with the best performance",
"ext-uri": "to use the PHP native URI class",
"jeremykendall/php-domain-parser": "to further parse the URI host and resolve its Public Suffix and Top Level Domain",
"league/uri-components": "to provide additional tools to manipulate URI objects components",
"league/uri-polyfill": "to backport the PHP URI extension for older versions of PHP",
"php-64bit": "to improve IPV4 host parsing",
"rowbot/url": "to handle URLs using the WHATWG URL Living Standard specification",
"symfony/polyfill-intl-idn": "to handle IDN host via the Symfony polyfill if ext-intl is not present"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "7.x-dev"
}
},
"autoload": {
"psr-4": {
"League\\Uri\\": ""
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Ignace Nyamagana Butera",
"email": "nyamsprod@gmail.com",
"homepage": "https://nyamsprod.com"
}
],
"description": "URI manipulation library",
"homepage": "https://uri.thephpleague.com",
"keywords": [
"URN",
"data-uri",
"file-uri",
"ftp",
"hostname",
"http",
"https",
"middleware",
"parse_str",
"parse_url",
"psr-7",
"query-string",
"querystring",
"rfc2141",
"rfc3986",
"rfc3987",
"rfc6570",
"rfc8141",
"uri",
"uri-template",
"url",
"ws"
],
"support": {
"docs": "https://uri.thephpleague.com",
"forum": "https://thephpleague.slack.com",
"issues": "https://github.com/thephpleague/uri-src/issues",
"source": "https://github.com/thephpleague/uri/tree/7.8.1"
},
"funding": [
{
"url": "https://github.com/sponsors/nyamsprod",
"type": "github"
}
],
"time": "2026-03-15T20:22:25+00:00"
},
{
"name": "league/uri-interfaces",
"version": "7.8.1",
"source": {
"type": "git",
"url": "https://github.com/thephpleague/uri-interfaces.git",
"reference": "85d5c77c5d6d3af6c54db4a78246364908f3c928"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thephpleague/uri-interfaces/zipball/85d5c77c5d6d3af6c54db4a78246364908f3c928",
"reference": "85d5c77c5d6d3af6c54db4a78246364908f3c928",
"shasum": ""
},
"require": {
"ext-filter": "*",
"php": "^8.1",
"psr/http-message": "^1.1 || ^2.0"
},
"suggest": {
"ext-bcmath": "to improve IPV4 host parsing",
"ext-gmp": "to improve IPV4 host parsing",
"ext-intl": "to handle IDN host with the best performance",
"php-64bit": "to improve IPV4 host parsing",
"rowbot/url": "to handle URLs using the WHATWG URL Living Standard specification",
"symfony/polyfill-intl-idn": "to handle IDN host via the Symfony polyfill if ext-intl is not present"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "7.x-dev"
}
},
"autoload": {
"psr-4": {
"League\\Uri\\": ""
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Ignace Nyamagana Butera",
"email": "nyamsprod@gmail.com",
"homepage": "https://nyamsprod.com"
}
],
"description": "Common tools for parsing and resolving RFC3987/RFC3986 URI",
"homepage": "https://uri.thephpleague.com",
"keywords": [
"data-uri",
"file-uri",
"ftp",
"hostname",
"http",
"https",
"parse_str",
"parse_url",
"psr-7",
"query-string",
"querystring",
"rfc3986",
"rfc3987",
"rfc6570",
"uri",
"url",
"ws"
],
"support": {
"docs": "https://uri.thephpleague.com",
"forum": "https://thephpleague.slack.com",
"issues": "https://github.com/thephpleague/uri-src/issues",
"source": "https://github.com/thephpleague/uri-interfaces/tree/7.8.1"
},
"funding": [
{
"url": "https://github.com/sponsors/nyamsprod",
"type": "github"
}
],
"time": "2026-03-08T20:05:35+00:00"
},
{
"name": "masterminds/html5",
"version": "2.10.0",
"source": {
"type": "git",
"url": "https://github.com/Masterminds/html5-php.git",
"reference": "fcf91eb64359852f00d921887b219479b4f21251"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251",
"reference": "fcf91eb64359852f00d921887b219479b4f21251",
"shasum": ""
},
"require": {
"ext-dom": "*",
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.7-dev"
}
},
"autoload": {
"psr-4": {
"Masterminds\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Matt Butcher",
"email": "technosophos@gmail.com"
},
{
"name": "Matt Farina",
"email": "matt@mattfarina.com"
},
{
"name": "Asmir Mustafic",
"email": "goetas@gmail.com"
}
],
"description": "An HTML5 parser and serializer.",
"homepage": "http://masterminds.github.io/html5-php",
"keywords": [
"HTML5",
"dom",
"html",
"parser",
"querypath",
"serializer",
"xml"
],
"support": {
"issues": "https://github.com/Masterminds/html5-php/issues",
"source": "https://github.com/Masterminds/html5-php/tree/2.10.0"
},
"time": "2025-07-25T09:04:22+00:00"
},
{ {
"name": "monolog/monolog", "name": "monolog/monolog",
"version": "3.10.0", "version": "3.10.0",
@@ -1844,6 +2158,114 @@
}, },
"time": "2019-01-08T18:20:26+00:00" "time": "2019-01-08T18:20:26+00:00"
}, },
{
"name": "psr/http-factory",
"version": "1.1.0",
"source": {
"type": "git",
"url": "https://github.com/php-fig/http-factory.git",
"reference": "2b4765fddfe3b508ac62f829e852b1501d3f6e8a"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/php-fig/http-factory/zipball/2b4765fddfe3b508ac62f829e852b1501d3f6e8a",
"reference": "2b4765fddfe3b508ac62f829e852b1501d3f6e8a",
"shasum": ""
},
"require": {
"php": ">=7.1",
"psr/http-message": "^1.0 || ^2.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"autoload": {
"psr-4": {
"Psr\\Http\\Message\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "PHP-FIG",
"homepage": "https://www.php-fig.org/"
}
],
"description": "PSR-17: Common interfaces for PSR-7 HTTP message factories",
"keywords": [
"factory",
"http",
"message",
"psr",
"psr-17",
"psr-7",
"request",
"response"
],
"support": {
"source": "https://github.com/php-fig/http-factory"
},
"time": "2024-04-15T12:06:14+00:00"
},
{
"name": "psr/http-message",
"version": "2.0",
"source": {
"type": "git",
"url": "https://github.com/php-fig/http-message.git",
"reference": "402d35bcb92c70c026d1a6a9883f06b2ead23d71"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/php-fig/http-message/zipball/402d35bcb92c70c026d1a6a9883f06b2ead23d71",
"reference": "402d35bcb92c70c026d1a6a9883f06b2ead23d71",
"shasum": ""
},
"require": {
"php": "^7.2 || ^8.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
},
"autoload": {
"psr-4": {
"Psr\\Http\\Message\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "PHP-FIG",
"homepage": "https://www.php-fig.org/"
}
],
"description": "Common interface for HTTP messages",
"homepage": "https://github.com/php-fig/http-message",
"keywords": [
"http",
"http-message",
"psr",
"psr-7",
"request",
"response"
],
"support": {
"source": "https://github.com/php-fig/http-message/tree/2.0"
},
"time": "2023-04-04T09:54:51+00:00"
},
{ {
"name": "psr/log", "name": "psr/log",
"version": "3.0.2", "version": "3.0.2",
@@ -6817,8 +7239,10 @@
"php": "^8.2", "php": "^8.2",
"ext-ctype": "*", "ext-ctype": "*",
"ext-curl": "*", "ext-curl": "*",
"ext-iconv": "*" "ext-dom": "*",
"ext-iconv": "*",
"ext-libxml": "*"
}, },
"platform-dev": {}, "platform-dev": {},
"plugin-api-version": "2.6.0" "plugin-api-version": "2.9.0"
} }

View File

@@ -40,6 +40,8 @@ final readonly class AgentRunner
{ {
$prompt = trim($prompt); $prompt = trim($prompt);
$swagFullOutPut = ''; $swagFullOutPut = '';
$firstThinkLoop = true;
$shopResults = [];
if ($prompt === '') { if ($prompt === '') {
yield '❌ Empty prompt.'; yield '❌ Empty prompt.';
@@ -61,6 +63,27 @@ final readonly class AgentRunner
yield $this->systemMsg("Ich analysiere deine Anfrage...", "think"); yield $this->systemMsg("Ich analysiere deine Anfrage...", "think");
// ---------------------------------------------------------
// 2) Extract URL content (if present)
// ---------------------------------------------------------
yield $this->systemMsg("Ich prüfe auf Internet Quellen...", "think");
$urlContent = $this->urlAnalyzer->extractContentFromPrompt($prompt);
// ---------------------------------------------------------
// 3) Retrieve RAG knowledge
// ---------------------------------------------------------
yield $this->systemMsg("Ich hole relevante Daten aus meinem RAG Wissen...", "think");
$knowledgeChunks = $this->retriever->retrieve($prompt);
// ---------------------------------------------------------
// 4) commerce/shop search
// ---------------------------------------------------------
$commerceMeta = $this->commerceIntentLite->detect($prompt);
$commerceIntent = (string)($commerceMeta['intent'] ?? CommerceIntentLite::NONE);
if ($commerceIntent === CommerceIntentLite::PRODUCT_SEARCH || $commerceIntent === CommerceIntentLite::ADVISORY_PRODUCT_SEARCH) {
//PreOptimize swag search query
$promptSwagSearch = ' $promptSwagSearch = '
Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche. Erzeuge aus dem folgenden Nutzereingabetext einen kurzen Suchtext für die Shopware-6-Suche.
@@ -78,11 +101,15 @@ final readonly class AgentRunner
Ausgabeformat: Ausgabeformat:
Keyword1 Keyword2 Keyword3 Keyword1 Keyword2 Keyword3
Text: ' . $prompt . ' Nutzereingabetext: ' . $prompt . '
'; ';
//Reset thinkSuppressor
$this->thinkSuppressor->reset(); $this->thinkSuppressor->reset();
yield $this->systemMsg("Ich optimere die Shopanfrage...", "think");
//Call ai for optimized swag query
foreach ($this->ollamaClient->stream($promptSwagSearch) as $swagToken) { foreach ($this->ollamaClient->stream($promptSwagSearch) as $swagToken) {
if (!is_string($swagToken)) { if (!is_string($swagToken)) {
@@ -98,29 +125,11 @@ final readonly class AgentRunner
$swagFullOutPut .= $swagCleanToken; $swagFullOutPut .= $swagCleanToken;
} }
yield $this->systemMsg("Ich habe folgende Keywords an die Shopsuche geschickt: " . $swagFullOutPut, "think");
// ---------------------------------------------------------
// 2) Extract URL content (if present)
// ---------------------------------------------------------
$urlContent = $this->urlAnalyzer->extractContentFromPrompt($prompt);
// ---------------------------------------------------------
// 3) Retrieve RAG knowledge
// ---------------------------------------------------------
yield $this->systemMsg("Ich hole relevante Daten aus meinem RAG Wissen...", "think");
$knowledgeChunks = $this->retriever->retrieve($prompt);
// ---------------------------------------------------------
// 4) commerce/shop search
// ---------------------------------------------------------
$commerceMeta = $this->commerceIntentLite->detect($prompt);
$commerceIntent = (string)($commerceMeta['intent'] ?? CommerceIntentLite::NONE);
yield $this->systemMsg("Ich rufe Shopdaten ab (type: " . $commerceIntent . ")", "think"); yield $this->systemMsg("Ich rufe Shopdaten ab (type: " . $commerceIntent . ")", "think");
//Search in swag by ai optimized query
$shopResults = $swagFullOutPut ? $this->shopSearchService->search($swagFullOutPut, $commerceIntent) : ''; $shopResults = $swagFullOutPut ? $this->shopSearchService->search($swagFullOutPut, $commerceIntent) : '';
}
if ($commerceIntent === CommerceIntentLite::PRODUCT_SEARCH) { if ($commerceIntent === CommerceIntentLite::PRODUCT_SEARCH) {
$knowledgeChunks = array_slice($knowledgeChunks, 0, 2); $knowledgeChunks = array_slice($knowledgeChunks, 0, 2);
@@ -128,13 +137,7 @@ final readonly class AgentRunner
$knowledgeChunks = array_slice($knowledgeChunks, 0, 3); $knowledgeChunks = array_slice($knowledgeChunks, 0, 3);
} }
if ($shopResults) { yield $this->systemMsg("Ich analysiere alle Informationen...", "think");
yield $this->systemMsg("Ich verarbeite Shopdaten...", "think");
} else {
yield $this->systemMsg("Ich habe keine releveanten Shopdaten gefunden...", "think");
}
yield $this->systemMsg("Ich analysiere gefundene Informationen...", "think");
// --------------------------------------------------------- // ---------------------------------------------------------
// 5) Build final prompt // 5) Build final prompt
@@ -179,8 +182,10 @@ final readonly class AgentRunner
$cleanToken = $this->thinkSuppressor->filter((string)$token); $cleanToken = $this->thinkSuppressor->filter((string)$token);
if ($cleanToken === '') { if ($cleanToken === '') {
if ($firstThinkLoop) {
yield $this->systemMsg("Denke nach...", "think"); yield $this->systemMsg("Denke nach...", "think");
usleep(500); $firstThinkLoop = false;
}
continue; continue;
} }

View File

@@ -4,7 +4,9 @@ declare(strict_types=1);
namespace App\Context; namespace App\Context;
use RuntimeException; use fivefilters\Readability\Configuration;
use fivefilters\Readability\ParseException;
use fivefilters\Readability\Readability;
/** /**
* UrlAnalyzer * UrlAnalyzer
@@ -65,30 +67,29 @@ final class UrlAnalyzer
], ],
]); ]);
$handle = @fopen($url, 'rb', false, $context); $html = @file_get_contents($url, false, $context);
if ($handle === false) { if ($html === false || $html === '') {
return ''; return '';
} }
$config = new Configuration();
$config->setFixRelativeURLs(true);
$config->setOriginalURL($url);
$readability = new Readability($config);
try { try {
$html = ''; $readability->parse($html);
while (!feof($handle) && strlen($html) < $this->maxChars * 2) { $content = $readability->getContent() ?? '';
$html .= fread($handle, 1024); } catch (ParseException) {
}
} finally {
fclose($handle);
}
if ($html === '') {
return ''; return '';
} }
// Remove script and style blocks if ($content === '') {
$html = preg_replace('~<script[^>]*>.*?</script>~is', '', $html) ?? $html; return '';
$html = preg_replace('~<style[^>]*>.*?</style>~is', '', $html) ?? $html; }
// Strip remaining HTML and normalize whitespace $text = strip_tags($content);
$text = strip_tags($html);
$text = preg_replace('/\s+/u', ' ', $text) ?? $text; $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
return mb_substr(trim($text), 0, $this->maxChars); return mb_substr(trim($text), 0, $this->maxChars);