Use possessive quantifiers to optimise regex matching performance

See 'ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS' section at:
https://pcre.org/pcre.txt

Thanks to Giuseppe D'Angelo for pointing that out.
wilder
Ahmad Samir 4 years ago committed by Kurt Hindenburg
parent 3b7e73f54e
commit 40b1fcf487
  1. 68
      src/autotests/HotSpotFilterTest.cpp
  2. 45
      src/filterHotSpots/UrlFilter.cpp

@ -12,35 +12,71 @@ QTEST_GUILESS_MAIN(HotSpotFilterTest)
void HotSpotFilterTest::testUrlFilterRegex_data() void HotSpotFilterTest::testUrlFilterRegex_data()
{ {
QTest::addColumn<QString>("url"); QTest::addColumn<QString>("url");
QTest::addColumn<QString>("expectedUrl");
QTest::addColumn<bool>("matchResult"); QTest::addColumn<bool>("matchResult");
// A space, \n, or \t before the url to match what happens at runtime, // A space, \n, or \t before the url to match what happens at runtime,
// i.e. to match "http" but not "foohttp" // i.e. to match "http" but not "foohttp"
QTest::newRow("url_simple") << " https://api.kde.org" << true; QTest::newRow("url_simple") << " https://api.kde.org"
QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098" << true; << "https://api.kde.org" << true;
QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere" << true; QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098"
QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo" << true; << "https://api.kde.org:2098" << true;
QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere" << true; QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere"
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org" << true; << "https://api.kde.org/path/to/somewhere" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment" << true; QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo"
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true; << "https://user:pass@api.kde.org?somequery=foo" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true; QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere"
QTest::newRow("markup") << " [https://foobar](https://foobar)" << true; << "https://api.kde.org:2098/path/to/somewhere" << true;
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org"
QTest::newRow("bad_url_no_scheme") << QStringLiteral(" www.kde.org") << false; << "https://user:blah@api.kde.org" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment"
<< "https://user:blah@api.kde.org:2098#fragment" << true;
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment"
<< "https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md"
<< "https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)"
<< "https://foobar" << true;
QTest::newRow("bracket_before") << "[198]http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("quote_before") << "\"http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("file_scheme") << "file:///some/file"
<< "file:///some/file" << true;
QTest::newRow("uppercase_host") << "https://EXAMPLE.com"
<< "https://EXAMPLE.com" << true;
QTest::newRow("uppercase_query") << "https://example.com?fooOpt=barVal"
<< "https://example.com?fooOpt=barVal" << true;
QTest::newRow("uppercase_fragment") << "https://example.com?fooOpt=barVal#FRAG"
<< "https://example.com?fooOpt=barVal#FRAG" << true;
QTest::newRow("www") << " www.kde.org"
<< "www.kde.org" << true;
QTest::newRow("with_comma_in_path") << "https://example.com/foo,bar"
<< "https://example.com/foo,bar" << true;
QTest::newRow("empty_query") << "http://example.com/?"
<< "http://example.com" << true;
QTest::newRow("empty_fragment") << "http://example.com/#"
<< "http://example.com" << true;
} }
void HotSpotFilterTest::testUrlFilterRegex() void HotSpotFilterTest::testUrlFilterRegex()
{ {
QFETCH(QString, url); QFETCH(QString, url);
QFETCH(QString, expectedUrl);
QFETCH(bool, matchResult); QFETCH(bool, matchResult);
const QRegularExpression &regex = Konsole::UrlFilter::FullUrlRegExp; const QRegularExpression &regex = Konsole::UrlFilter::FullUrlRegExp;
const QRegularExpressionMatch match = regex.match(url); const QRegularExpressionMatch match = regex.match(url);
// qDebug() << match;
QCOMPARE(match.hasMatch(), matchResult); QCOMPARE(match.hasMatch(), matchResult);
if (strcmp(QTest::currentDataTag(), "markup") == 0) { if (matchResult) {
QCOMPARE(match.capturedView(0), u"https://foobar"); QCOMPARE(match.capturedView(0), expectedUrl);
} else if (matchResult) {
QCOMPARE(match.capturedView(0), url.trimmed());
} }
} }

@ -19,11 +19,17 @@ using namespace Konsole;
// FullUrlRegExp is implemented based on: // FullUrlRegExp is implemented based on:
// https://datatracker.ietf.org/doc/html/rfc3986 // https://datatracker.ietf.org/doc/html/rfc3986
// See above URL for what "unreserved", "pct-encoded" ...etc mean, also // See above URL for what "unreserved", "pct-encoded" ...etc mean, also
// for the regex used for each part of the url being matched against // for the regex used for each part of the url being matched against.
//
// unreserved / pct-encoded / sub-delims // It deviates from rfc3986:
// [a-z0-9\\-._~%!$&'()*+,;=] // - We only recognize URIs with authority (even if it is an empty authority)
// The above string is used in various char[] below // - We match URIs starting with 'www.'
// - "userinfo" is assumed to have a single ':' character
// - We _don't_ match IPv6 addresses (e.g. http://[2010:836B:4179::836B:4179])
// or IPvFuture
// - "port" (':1234'), if present, is assumed to be non-empty
// - We don't check the validity of percent-encoded characters
// (e.g. "www.example.com/foo%XXbar")
// All () groups are non-capturing (by using "(?:)" notation) // All () groups are non-capturing (by using "(?:)" notation)
// less bookkeeping on the PCRE engine side // less bookkeeping on the PCRE engine side
@ -31,32 +37,37 @@ using namespace Konsole;
// scheme:// // scheme://
// - Must start with an ASCII letter, preceeded by any non-word character, // - Must start with an ASCII letter, preceeded by any non-word character,
// so "http" but not "mhttp" // so "http" but not "mhttp"
static const char scheme[] = "(?<=^|\\s|\\W)(?:[a-z][a-z0-9+\\-.]*://)"; static const char scheme_or_www[] = "(?<=^|[\\s\\[\\]()'\"])(?:www\\.|[a-z][a-z0-9+\\-.]*+://)";
// unreserved / pct-encoded / sub-delims
#define COMMON_1 "a-z0-9\\-._~%!$&'()*+,;="
/* clang-format off */
// user:password@ // user:password@
static const char userInfo[] = static const char userInfo[] =
"(?:" "(?:"
"[a-z0-9\\-._~%!$&'()*+,;=]+?:?" "[" COMMON_1 "]+?:?"
"[a-z0-9\\-._~%!$&'()*+,;=]+@" "[" COMMON_1 "]++@"
")?"; ")?+";
static const char host[] = "(?:[a-z0-9\\-._~%!$&'()*+,;=]+)"; // www.foo.bar static const char host[] = "(?:[" COMMON_1 "]*+)"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?"; // :1234 static const char port[] = "(?::[0-9]+)?+"; // :1234
static const char path[] = "(?:[a-zA-Z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // /path/to/some/place
static const char query[] = "(?:\\?[a-z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // "?somequery=bar" #define COMMON_2 "a-z0-9\\-._~%!$&'()*+,;=:@/"
static const char fragment[] = "(?:#[a-z0-9/?]+)?"; static const char path[] = "(?:/[" COMMON_2 "]+)?+"; // /path/to/some/place
static const char query[] = "(?:\\?[" COMMON_2 "]+)?+"; // "?somequery=bar"
static const char fragment[] = "(?:#[" COMMON_2 "]+)?+"; // "#fragment"
using LS1 = QLatin1String; using LS1 = QLatin1String;
/* clang-format off */
const QRegularExpression UrlFilter::FullUrlRegExp( const QRegularExpression UrlFilter::FullUrlRegExp(
LS1(scheme) LS1(scheme_or_www)
+ LS1(userInfo) + LS1(userInfo)
+ LS1(host) + LS1(host)
+ LS1(port) + LS1(port)
+ LS1(path) + LS1(path)
+ LS1(query) + LS1(query)
+ LS1(fragment) + LS1(fragment)
); , QRegularExpression::CaseInsensitiveOption);
/* clang-format on */ /* clang-format on */
///////////////////////////////////////////// /////////////////////////////////////////////

Loading…
Cancel
Save