Skip to content
Snippets Groups Projects
Unverified Commit 39cdf61a authored by Eugen Rochko's avatar Eugen Rochko Committed by GitHub
Browse files

Add support for structured data and more OpenGraph tags to link cards (#16938)

Save preview cards under their canonical URL

Increase max redirects to follow from 2 to 3
parent 989c67d2
No related branches found
No related tags found
No related merge requests found
# frozen_string_literal: true
# Extracts preview-card metadata (title, description, image, embedded player,
# author and publisher details, canonical URL) from an HTML document, reading
# JSON-LD structured data, OpenGraph/Twitter meta tags and plain HTML tags.
class LinkDetailsExtractor
  include ActionView::Helpers::TagHelper

  # Read-only view over the JSON-LD document embedded in a page.
  #
  # The payload comes from a remote page and is therefore untrusted: it may be
  # malformed, may be a top-level array, and individual properties may be
  # strings, objects or arrays of either.
  class StructuredData
    # @param data [String] raw contents of an `application/ld+json` script tag
    def initialize(data)
      @data = data
    end

    def headline
      json['headline']
    end

    def description
      json['description']
    end

    # @return the `url` of the first image when it is an object, otherwise the
    #   first image value itself (may be nil)
    def image
      obj = first_of_value(json['image'])
      return obj['url'] if obj.is_a?(Hash)

      obj
    end

    def date_published
      json['datePublished']
    end

    def date_modified
      json['dateModified']
    end

    def author_name
      author['name']
    end

    def author_url
      author['url']
    end

    def publisher_name
      publisher['name']
    end

    private

    def author
      hash_or_empty(first_of_value(json['author']))
    end

    def publisher
      hash_or_empty(first_of_value(json['publisher']))
    end

    # Returns the first element when given an array, the value itself otherwise.
    def first_of_value(arr)
      arr.is_a?(Array) ? arr.first : arr
    end

    # Anything that is not an object is treated as empty. Indexing a String
    # with 'name' or 'url' would do a substring match and return bogus values.
    def hash_or_empty(value)
      value.is_a?(Hash) ? value : {}
    end

    # Parsed JSON-LD as a Hash. Malformed JSON or a non-object document yields
    # an empty Hash so that every reader falls through to its next source.
    def json
      @json ||= begin
        # Strict mode restricts parsing to plain JSON types, independent of
        # any process-wide Oj defaults, since the input is untrusted.
        hash_or_empty(first_of_value(Oj.load(@data, mode: :strict)))
      rescue Oj::ParseError, EncodingError
        {}
      end
    end
  end

  # @param original_url [String] URL the document was fetched from
  # @param html [String] raw HTML of the document
  # @param html_charset [String, nil] charset hint passed to the encoding detector
  def initialize(original_url, html, html_charset)
    @original_url = Addressable::URI.parse(original_url)
    @html         = html
    @html_charset = html_charset
  end

  # @return [Hash] attributes for a preview card; textual and numeric fields
  #   fall back to '' and 0 respectively, while image_remote_url may be nil
  def to_preview_card_attributes
    {
      title: title || '',
      description: description || '',
      image_remote_url: image,
      type: type,
      width: width || 0,
      height: height || 0,
      html: html || '',
      provider_name: provider_name || '',
      provider_url: provider_url || '',
      author_name: author_name || '',
      author_url: author_url || '',
      embed_url: embed_url || '',
    }
  end

  # @return [Symbol] :video when the page declares a valid player URL, :link otherwise
  def type
    player_url.present? ? :video : :link
  end

  # @return [String, nil] iframe markup embedding the player, or nil without a player URL
  def html
    return if player_url.blank?

    # The second positional argument of content_tag is the element content;
    # it must be nil here, otherwise the attribute hash is rendered as the
    # iframe body and no attributes are emitted.
    content_tag(:iframe, nil, src: player_url, width: width, height: height, allowtransparency: 'true', scrolling: 'no', frameborder: '0')
  end

  def width
    opengraph_tag('twitter:player:width')
  end

  def height
    opengraph_tag('twitter:player:height')
  end

  def title
    structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first
  end

  def description
    structured_data&.description || opengraph_tag('og:description') || meta_tag('description')
  end

  def image
    valid_url_or_nil(opengraph_tag('og:image'))
  end

  # @return [String] the og:url / rel=canonical URL when it is valid and on
  #   the same host as the original URL, otherwise the original URL
  def canonical_url
    valid_url_or_nil(opengraph_tag('og:url') || link_tag('canonical'), same_origin_only: true) || @original_url.to_s
  end

  def provider_name
    structured_data&.publisher_name || opengraph_tag('og:site_name')
  end

  def provider_url
    valid_url_or_nil(host_to_url(opengraph_tag('og:site')))
  end

  def author_name
    structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')
  end

  def author_url
    structured_data&.author_url
  end

  def embed_url
    valid_url_or_nil(opengraph_tag('twitter:player:stream'))
  end

  private

  def player_url
    valid_url_or_nil(opengraph_tag('twitter:player'))
  end

  # Prefixes a bare host with http:// unless it already carries an http(s) scheme.
  def host_to_url(str)
    return if str.blank?

    str.start_with?(/https?:\/\//) ? str : "http://#{str}"
  end

  # Joins +str+ onto the original URL and returns the result as a String, or
  # nil when it is blank, unparseable, has no host, is not http(s), or (with
  # +same_origin_only+) points to a different host than the original URL.
  def valid_url_or_nil(str, same_origin_only: false)
    return if str.blank?

    url = @original_url + Addressable::URI.parse(str)

    return if url.host.blank? || !%w(http https).include?(url.scheme) || (same_origin_only && url.host != @original_url.host)

    url.to_s
  rescue Addressable::URI::InvalidURIError
    nil
  end

  def link_tag(name)
    document.xpath("//link[@rel=\"#{name}\"]").map { |link| link['href'] }.first
  end

  # Content of the first <meta> whose property or name attribute equals +name+.
  def opengraph_tag(name)
    document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").map { |meta| meta['content'] }.first
  end

  def meta_tag(name)
    document.xpath("//meta[@name=\"#{name}\"]").map { |meta| meta['content'] }.first
  end

  # Wrapper around the first JSON-LD script of the page, or nil when the page
  # has none. Memoized with defined? so that a nil result is cached as well.
  def structured_data
    return @structured_data if defined?(@structured_data)

    json_ld = document.xpath('//script[@type="application/ld+json"]').map(&:content).first
    @structured_data = json_ld.present? ? StructuredData.new(json_ld) : nil
  end

  def document
    @document ||= Nokogiri::HTML(@html, nil, encoding)
  end

  # Detected encoding name, used only when the detector's confidence exceeds 60.
  def encoding
    @encoding ||= begin
      guess = detector.detect(@html, @html_charset)
      guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
    end
  end

  def detector
    @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
      detector.strip_tags = true
    end
  end
end
......@@ -94,7 +94,7 @@ class Request
end
def http_client
HTTP.use(:auto_inflate).timeout(TIMEOUT.dup).follow(max_hops: 2)
HTTP.use(:auto_inflate).timeout(TIMEOUT.dup).follow(max_hops: 3)
end
end
......
......@@ -13,12 +13,12 @@ class FetchLinkCardService < BaseService
}iox
def call(status)
@status = status
@url = parse_urls
@status = status
@original_url = parse_urls
return if @url.nil? || @status.preview_cards.any?
return if @original_url.nil? || @status.preview_cards.any?
@url = @url.to_s
@url = @original_url.to_s
RedisLock.acquire(lock_options) do |lock|
if lock.acquired?
......@@ -31,7 +31,7 @@ class FetchLinkCardService < BaseService
attach_card if @card&.persisted?
rescue HTTP::Error, OpenSSL::SSL::SSLError, Addressable::URI::InvalidURIError, Mastodon::HostValidationError, Mastodon::LengthValidationError => e
Rails.logger.debug "Error fetching link #{@url}: #{e}"
Rails.logger.debug "Error fetching link #{@original_url}: #{e}"
nil
end
......@@ -47,6 +47,12 @@ class FetchLinkCardService < BaseService
return @html if defined?(@html)
Request.new(:get, @url).add_headers('Accept' => 'text/html', 'User-Agent' => Mastodon::Version.user_agent + ' Bot').perform do |res|
# We follow redirects, and ideally we want to save the preview card for
# the destination URL and not any link shortener in-between, so here
# we set the URL to the one of the last response in the redirect chain
@url = res.request.uri.to_s.to_s
@card = PreviewCard.find_or_initialize_by(url: @url) if @card.url != @url
if res.code == 200 && res.mime_type == 'text/html'
@html_charset = res.charset
@html = res.body_with_limit
......@@ -63,12 +69,15 @@ class FetchLinkCardService < BaseService
end
def parse_urls
if @status.local?
urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize }
else
html = Nokogiri::HTML(@status.text)
links = html.css('a')
urls = links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize)
urls = begin
if @status.local?
@status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize }
else
document = Nokogiri::HTML(@status.text)
links = document.css('a')
links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize)
end
end
urls.reject { |uri| bad_url?(uri) }.first
......@@ -79,18 +88,16 @@ class FetchLinkCardService < BaseService
uri.host.blank? || TagManager.instance.local_url?(uri.to_s) || !%w(http https).include?(uri.scheme)
end
# rubocop:disable Naming/MethodParameterName
def mention_link?(a)
def mention_link?(anchor)
@status.mentions.any? do |mention|
a['href'] == ActivityPub::TagManager.instance.url_for(mention.account)
anchor['href'] == ActivityPub::TagManager.instance.url_for(mention.account)
end
end
def skip_link?(a)
def skip_link?(anchor)
# Avoid links for hashtags and mentions (microformats)
a['rel']&.include?('tag') || a['class']&.match?(/u-url|h-card/) || mention_link?(a)
anchor['rel']&.include?('tag') || anchor['class']&.match?(/u-url|h-card/) || mention_link?(anchor)
end
# rubocop:enable Naming/MethodParameterName
def attempt_oembed
service = FetchOEmbedService.new
......@@ -139,42 +146,14 @@ class FetchLinkCardService < BaseService
def attempt_opengraph
return if html.nil?
detector = CharlockHolmes::EncodingDetector.new
detector.strip_tags = true
guess = detector.detect(@html, @html_charset)
encoding = guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
page = Nokogiri::HTML(@html, nil, encoding)
player_url = meta_property(page, 'twitter:player')
if player_url && !bad_url?(Addressable::URI.parse(player_url))
@card.type = :video
@card.width = meta_property(page, 'twitter:player:width') || 0
@card.height = meta_property(page, 'twitter:player:height') || 0
@card.html = content_tag(:iframe, nil, src: player_url,
width: @card.width,
height: @card.height,
allowtransparency: 'true',
scrolling: 'no',
frameborder: '0')
else
@card.type = :link
end
@card.title = meta_property(page, 'og:title').presence || page.at_xpath('//title')&.content || ''
@card.description = meta_property(page, 'og:description').presence || meta_property(page, 'description') || ''
@card.image_remote_url = (Addressable::URI.parse(@url) + meta_property(page, 'og:image')).to_s if meta_property(page, 'og:image')
return if @card.title.blank? && @card.html.blank?
@card.save_with_optional_image!
end
link_details_extractor = LinkDetailsExtractor.new(@url, @html, @html_charset)
def meta_property(page, property)
page.at_xpath("//meta[contains(concat(' ', normalize-space(@property), ' '), ' #{property} ')]")&.attribute('content')&.value || page.at_xpath("//meta[@name=\"#{property}\"]")&.attribute('content')&.value
@card = PreviewCard.find_or_initialize_by(url: link_details_extractor.canonical_url) if link_details_extractor.canonical_url != @card.url
@card.assign_attributes(link_details_extractor.to_preview_card_attributes)
@card.save_with_optional_image! unless @card.title.blank? && @card.html.blank?
end
def lock_options
{ redis: Redis.current, key: "fetch:#{@url}", autorelease: 15.minutes.seconds }
{ redis: Redis.current, key: "fetch:#{@original_url}", autorelease: 15.minutes.seconds }
end
end
require 'rails_helper'
# Specs for LinkDetailsExtractor#canonical_url: a canonical link is only
# honoured when it points to the same host as the originally fetched URL.
RSpec.describe LinkDetailsExtractor do
  let(:original_url) { '' }
  let(:html) { '' }
  let(:html_charset) { nil }

  subject { described_class.new(original_url, html, html_charset) }

  describe '#canonical_url' do
    let(:original_url) { 'https://foo.com/article?bar=baz123' }

    context 'when canonical URL points to another host' do
      let(:html) { '<!doctype html><link rel="canonical" href="https://bar.com/different-article" />' }

      it 'ignores the canonical URL' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL points to the same host' do
      let(:html) { '<!doctype html><link rel="canonical" href="https://foo.com/article" />' }

      # Same-host canonical links replace the original URL (query string dropped here).
      it 'uses the canonical URL' do
        expect(subject.canonical_url).to eq 'https://foo.com/article'
      end
    end
  end
end
require 'rails_helper'
RSpec.describe FetchLinkCardService, type: :service do
subject { FetchLinkCardService.new }
subject { described_class.new }
before do
stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt'))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment