From 40b1fcf4872b7d41eff3386db787bfc894c8e01b Mon Sep 17 00:00:00 2001 From: Ahmad Samir Date: Mon, 2 May 2022 17:49:26 +0200 Subject: [PATCH] Use possessive quantifiers to optimise regex matching performance See 'ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS' section at: https://pcre.org/pcre.txt Thanks to Giuseppe D'Angelo for pointing that out. --- src/autotests/HotSpotFilterTest.cpp | 68 ++++++++++++++++++++++------- src/filterHotSpots/UrlFilter.cpp | 45 +++++++++++-------- 2 files changed, 80 insertions(+), 33 deletions(-) diff --git a/src/autotests/HotSpotFilterTest.cpp b/src/autotests/HotSpotFilterTest.cpp index 43ae68d3..d85f1932 100644 --- a/src/autotests/HotSpotFilterTest.cpp +++ b/src/autotests/HotSpotFilterTest.cpp @@ -12,35 +12,71 @@ QTEST_GUILESS_MAIN(HotSpotFilterTest) void HotSpotFilterTest::testUrlFilterRegex_data() { QTest::addColumn("url"); + QTest::addColumn("expectedUrl"); QTest::addColumn("matchResult"); // A space, \n, or \t before the url to match what happens at runtime, // i.e. to match "http" but not "foohttp" - QTest::newRow("url_simple") << " https://api.kde.org" << true; - QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098" << true; - QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere" << true; - QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo" << true; - QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere" << true; - QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org" << true; - QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment" << true; - QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true; - QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true; - QTest::newRow("markup") << " [https://foobar](https://foobar)" << true; - - QTest::newRow("bad_url_no_scheme") << QStringLiteral(" www.kde.org") << false; + QTest::newRow("url_simple") << " https://api.kde.org" + << "https://api.kde.org" << true; + QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098" + << "https://api.kde.org:2098" << true; + QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere" + << "https://api.kde.org/path/to/somewhere" << true; + QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo" + << "https://user:pass@api.kde.org?somequery=foo" << true; + QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere" + << "https://api.kde.org:2098/path/to/somewhere" << true; + QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org" + << "https://user:blah@api.kde.org" << true; + QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment" + << "https://user:blah@api.kde.org:2098#fragment" << true; + QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" + << "https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true; + QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" + << "https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true; + QTest::newRow("markup") << " [https://foobar](https://foobar)" + << "https://foobar" << true; + + QTest::newRow("bracket_before") << "[198]http://www.ietf.org/rfc/rfc2396.txt" + << "http://www.ietf.org/rfc/rfc2396.txt" << true; + QTest::newRow("quote_before") << "\"http://www.ietf.org/rfc/rfc2396.txt" + << "http://www.ietf.org/rfc/rfc2396.txt" << true; + + QTest::newRow("file_scheme") << "file:///some/file" + << "file:///some/file" << true; + + QTest::newRow("uppercase_host") << "https://EXAMPLE.com" + << "https://EXAMPLE.com" << true; + QTest::newRow("uppercase_query") << "https://example.com?fooOpt=barVal" + << "https://example.com?fooOpt=barVal" << true; + QTest::newRow("uppercase_fragment") << "https://example.com?fooOpt=barVal#FRAG" + << "https://example.com?fooOpt=barVal#FRAG" << true; + + QTest::newRow("www") << " www.kde.org" + << "www.kde.org" << true; + + QTest::newRow("with_comma_in_path") << "https://example.com/foo,bar" + << "https://example.com/foo,bar" << true; + + QTest::newRow("empty_query") << "http://example.com/?" + << "http://example.com" << true; + QTest::newRow("empty_fragment") << "http://example.com/#" + << "http://example.com" << true; } void HotSpotFilterTest::testUrlFilterRegex() { QFETCH(QString, url); + QFETCH(QString, expectedUrl); QFETCH(bool, matchResult); const QRegularExpression ®ex = Konsole::UrlFilter::FullUrlRegExp; const QRegularExpressionMatch match = regex.match(url); + // qDebug() << match; + QCOMPARE(match.hasMatch(), matchResult); - if (strcmp(QTest::currentDataTag(), "markup") == 0) { - QCOMPARE(match.capturedView(0), u"https://foobar"); - } else if (matchResult) { - QCOMPARE(match.capturedView(0), url.trimmed()); + if (matchResult) { + QCOMPARE(match.capturedView(0), expectedUrl); } } diff --git a/src/filterHotSpots/UrlFilter.cpp b/src/filterHotSpots/UrlFilter.cpp index c3a1c039..ef58b86e 100644 --- a/src/filterHotSpots/UrlFilter.cpp +++ b/src/filterHotSpots/UrlFilter.cpp @@ -19,11 +19,17 @@ using namespace Konsole; // FullUrlRegExp is implemented based on: // https://datatracker.ietf.org/doc/html/rfc3986 // See above URL for what "unreserved", "pct-encoded" ...etc mean, also -// for the regex used for each part of the url being matched against - -// unreserved / pct-encoded / sub-delims -// [a-z0-9\\-._~%!$&'()*+,;=] -// The above string is used in various char[] below +// for the regex used for each part of the url being matched against. +// +// It deviates from rfc3986: +// - We only recognize URIs with authority (even if it is an empty authority) +// - We match URIs starting with 'www.' +// - "userinfo" is assumed to have a single ':' character +// - We _don't_ match IPv6 addresses (e.g. http://[2010:836B:4179::836B:4179]) +// or IPvFuture +// - "port" (':1234'), if present, is assumed to be non-empty +// - We don't check the validity of percent-encoded characters +// (e.g. "www.example.com/foo%XXbar") // All () groups are non-capturing (by using "(?:)" notation) // less bookkeeping on the PCRE engine side @@ -31,32 +37,37 @@ using namespace Konsole; // scheme:// // - Must start with an ASCII letter, preceeded by any non-word character, // so "http" but not "mhttp" -static const char scheme[] = "(?<=^|\\s|\\W)(?:[a-z][a-z0-9+\\-.]*://)"; +static const char scheme_or_www[] = "(?<=^|[\\s\\[\\]()'\"])(?:www\\.|[a-z][a-z0-9+\\-.]*+://)"; + +// unreserved / pct-encoded / sub-delims +#define COMMON_1 "a-z0-9\\-._~%!$&'()*+,;=" +/* clang-format off */ // user:password@ static const char userInfo[] = "(?:" - "[a-z0-9\\-._~%!$&'()*+,;=]+?:?" - "[a-z0-9\\-._~%!$&'()*+,;=]+@" - ")?"; -static const char host[] = "(?:[a-z0-9\\-._~%!$&'()*+,;=]+)"; // www.foo.bar -static const char port[] = "(?::[0-9]+)?"; // :1234 -static const char path[] = "(?:[a-zA-Z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // /path/to/some/place -static const char query[] = "(?:\\?[a-z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // "?somequery=bar" -static const char fragment[] = "(?:#[a-z0-9/?]+)?"; + "[" COMMON_1 "]+?:?" + "[" COMMON_1 "]++@" + ")?+"; +static const char host[] = "(?:[" COMMON_1 "]*+)"; // www.foo.bar +static const char port[] = "(?::[0-9]+)?+"; // :1234 + +#define COMMON_2 "a-z0-9\\-._~%!$&'()*+,;=:@/" +static const char path[] = "(?:/[" COMMON_2 "]+)?+"; // /path/to/some/place +static const char query[] = "(?:\\?[" COMMON_2 "]+)?+"; // "?somequery=bar" +static const char fragment[] = "(?:#[" COMMON_2 "]+)?+"; // "#fragment" using LS1 = QLatin1String; -/* clang-format off */ const QRegularExpression UrlFilter::FullUrlRegExp( - LS1(scheme) + LS1(scheme_or_www) + LS1(userInfo) + LS1(host) + LS1(port) + LS1(path) + LS1(query) + LS1(fragment) - ); + , QRegularExpression::CaseInsensitiveOption); /* clang-format on */ /////////////////////////////////////////////