From ad207456d64c76d21c17b26a954b459fe2dc0f54 Mon Sep 17 00:00:00 2001
From: "Renato \"Lond\" Cerqueira" <renato@lond.com.br>
Date: Thu, 16 Nov 2017 10:51:38 -0200
Subject: [PATCH] Improve language filter (#5724)

* Scrub text of html before detecting language.

* Detect language on statuses coming from activitypub.

* Fix rubocop comments.

* Remove custom emoji from text before language detection
---
 app/lib/activitypub/activity/create.rb |  2 +-
 app/lib/language_detector.rb           | 31 +++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index 376684c006..66e4f7c5ec 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -173,7 +173,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
   end
 
   def language_from_content
-    return nil unless language_map?
+    return LanguageDetector.instance.detect(text_from_content, @account) unless language_map?
     @object['contentMap'].keys.first
   end
 
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index a42460e109..c6f52f0c7e 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -38,12 +38,31 @@ class LanguageDetector
   end
 
   def simplify_text(text)
-    text.dup.tap do |new_text|
-      new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
-      new_text.gsub!(Account::MENTION_RE, '')
-      new_text.gsub!(Tag::HASHTAG_RE, '')
-      new_text.gsub!(/\s+/, ' ')
-    end
+    new_text = remove_html(text)
+    new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
+    new_text.gsub!(Account::MENTION_RE, '')
+    new_text.gsub!(Tag::HASHTAG_RE, '')
+    new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
+    new_text.gsub!(/\s+/, ' ')
+    new_text
+  end
+
+  def new_scrubber
+    scrubber = Rails::Html::PermitScrubber.new
+    scrubber.tags = %w(br p)
+    scrubber
+  end
+
+  def scrubber
+    @scrubber ||= new_scrubber
+  end
+
+  def remove_html(text)
+    text = Loofah.fragment(text).scrub!(scrubber).to_s
+    text.gsub!('<br>', "\n")
+    text.gsub!('</p><p>', "\n\n")
+    text.gsub!(/(^<p>|<\/p>$)/, '')
+    text
   end
 
   def default_locale(account)
-- 
GitLab