From 211d5c3c300b5a54b60c7b0142158144f9b0d392 Mon Sep 17 00:00:00 2001
From: Claire <claire.github-309c@sitedethib.com>
Date: Thu, 15 Jul 2021 15:56:58 +0200
Subject: [PATCH] Fix inefficiencies in auto-linking code (#16506)

The auto-linking code basically rewrote the whole string escaping non-ascii
characters in an inefficient way, and building a full character offset map
between the unescaped and escaped texts before sending the contents to
TwitterText's extractor.

Instead of doing that, this commit changes the TwitterText regexps to include
valid IRI characters in addition to valid URI characters.
---
 app/lib/formatter.rb                 | 31 +---------------------------
 config/initializers/twitter_regex.rb |  4 ++++
 2 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index fd65375262..1610f36892 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -214,39 +214,10 @@ class Formatter
     result.flatten.join
   end
 
-  UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
-
   def utf8_friendly_extractor(text, options = {})
-    old_to_new_index = [0]
-
-    escaped = text.chars.map do |c|
-      output = begin
-        if c.ord.to_s(16).length > 2 && !UNICODE_ESCAPE_BLACKLIST_RE.match?(c)
-          CGI.escape(c)
-        else
-          c
-        end
-      end
-
-      old_to_new_index << old_to_new_index.last + output.length
-
-      output
-    end.join
-
     # Note: I couldn't obtain list_slug with @user/list-name format
     # for mention so this requires additional check
-    special = Extractor.extract_urls_with_indices(escaped, options).map do |extract|
-      new_indices = [
-        old_to_new_index.find_index(extract[:indices].first),
-        old_to_new_index.find_index(extract[:indices].last),
-      ]
-
-      next extract.merge(
-        indices: new_indices,
-        url: text[new_indices.first..new_indices.last - 1]
-      )
-    end
-
+    special = Extractor.extract_urls_with_indices(text, options)
     standard = Extractor.extract_entities_with_indices(text, options)
     extra = Extractor.extract_extra_uris_with_indices(text, options)
 
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
index 3ff2aa9e5d..84c09ff35c 100644
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@@ -24,6 +24,10 @@ module Twitter::TwitterText
         )
       \)
     /iox
+    REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
+    REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
+    REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
+    REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
     REGEXEN[:valid_url_path] = /(?:
       (?:
         #{REGEXEN[:valid_general_url_path_chars]}*
-- 
GitLab