Skip to content
Snippets Groups Projects
Unverified Commit f1f6ddd5 authored by Eugen Rochko's avatar Eugen Rochko Committed by GitHub
Browse files

Fix structured data parsing from links choking on bad data (#17403)

* Fix structured data parsing from links choking on bad data

- Fix og:url meta tag being prioritized over canonical link tag
- Fix structured data parsing choking on commented-out CDATA declarations
- Fix HTML entities in title, description, provider_name, author_name
- Change structured data parsing to attempt every JSON-LD script tag

* Remove unnecessary slash escapes from CDATA regex pattern
parent 73a78239
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,19 @@
class LinkDetailsExtractor
include ActionView::Helpers::TagHelper
# Some publications wrap their JSON-LD data in their <script> tags
# in commented-out CDATA blocks, they need to be removed before
# attempting to parse JSON
CDATA_JUNK_PATTERN = %r{^[\s]*(
(/\*[\s]*<!\[CDATA\[[\s]*\*/) # Block comment style opening
|
(//[\s]*<!\[CDATA\[) # Single-line comment style opening
|
(/\*[\s]*\]\]>[\s]*\*/) # Block comment style closing
|
(//[\s]*\]\]>) # Single-line comment style closing
)[\s]*$}x
class StructuredData
SUPPORTED_TYPES = %w(
NewsArticle
......@@ -61,6 +74,10 @@ class LinkDetailsExtractor
publisher.dig('logo', 'url')
end
def valid?
json.present?
end
private
def author
......@@ -134,11 +151,11 @@ class LinkDetailsExtractor
end
def title
structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first
html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)
end
def description
structured_data&.description || opengraph_tag('og:description') || meta_tag('description')
html_entities.decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description'))
end
def image
......@@ -146,11 +163,11 @@ class LinkDetailsExtractor
end
def canonical_url
valid_url_or_nil(opengraph_tag('og:url') || link_tag('canonical'), same_origin_only: true) || @original_url.to_s
valid_url_or_nil(link_tag('canonical') || opengraph_tag('og:url'), same_origin_only: true) || @original_url.to_s
end
def provider_name
structured_data&.publisher_name || opengraph_tag('og:site_name')
html_entities.decode(structured_data&.publisher_name || opengraph_tag('og:site_name'))
end
def provider_url
......@@ -158,7 +175,7 @@ class LinkDetailsExtractor
end
def author_name
structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')
html_entities.decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username'))
end
def author_url
......@@ -223,10 +240,24 @@ class LinkDetailsExtractor
def structured_data
@structured_data ||= begin
json_ld = document.xpath('//script[@type="application/ld+json"]').map(&:content).first
json_ld.present? ? StructuredData.new(json_ld) : nil
rescue Oj::ParseError
nil
# Some publications have more than one JSON-LD definition on the page,
# and some of those definitions aren't valid JSON either, so we have
# to loop through here until we find something that is the right type
# and doesn't break
document.xpath('//script[@type="application/ld+json"]').filter_map do |element|
json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')
next if json_ld.blank?
structured_data = StructuredData.new(html_entities.decode(json_ld))
next unless structured_data.valid?
structured_data
rescue Oj::ParseError, EncodingError
Rails.logger.debug("Invalid JSON-LD in #{@original_url}")
next
end.first
end
end
......@@ -246,4 +277,8 @@ class LinkDetailsExtractor
detector.strip_tags = true
end
end
def html_entities
@html_entities ||= HTMLEntities.new
end
end
......@@ -26,4 +26,126 @@ RSpec.describe LinkDetailsExtractor do
end
end
end
context 'when structured data is present' do
let(:original_url) { 'https://example.com/page.html' }
context 'and is wrapped in CDATA tags' do
let(:html) { <<-HTML }
<!doctype html>
<html>
<head>
<script type="application/ld+json">
//<![CDATA[
{"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":"https://example.com/page.html","headline":"Foo","datePublished":"2022-01-31T19:53:00+00:00","url":"https://example.com/page.html","description":"Bar","author":{"@type":"Person","name":"Hoge"},"publisher":{"@type":"Organization","name":"Baz"}}
//]]>
</script>
</head>
</html>
HTML
describe '#title' do
it 'returns the title from structured data' do
expect(subject.title).to eq 'Foo'
end
end
describe '#description' do
it 'returns the description from structured data' do
expect(subject.description).to eq 'Bar'
end
end
describe '#provider_name' do
it 'returns the provider name from structured data' do
expect(subject.provider_name).to eq 'Baz'
end
end
describe '#author_name' do
it 'returns the author name from structured data' do
expect(subject.author_name).to eq 'Hoge'
end
end
end
context 'but the first tag is invalid JSON' do
let(:html) { <<-HTML }
<!doctype html>
<html>
<body>
<script type="application/ld+json">
{
"@context":"https://schema.org",
"@type":"ItemList",
"url":"https://example.com/page.html",
"name":"Foo",
"description":"Bar"
},
{
"@context": "https://schema.org",
"@type": "BreadcrumbList",
"itemListElement":[
{
"@type":"ListItem",
"position":1,
"item":{
"@id":"https://www.example.com",
"name":"Baz"
}
}
]
}
</script>
<script type="application/ld+json">
{
"@context":"https://schema.org",
"@type":"NewsArticle",
"mainEntityOfPage": {
"@type":"WebPage",
"@id": "http://example.com/page.html"
},
"headline": "Foo",
"description": "Bar",
"datePublished": "2022-01-31T19:46:00+00:00",
"author": {
"@type": "Organization",
"name": "Hoge"
},
"publisher": {
"@type": "NewsMediaOrganization",
"name":"Baz",
"url":"https://example.com/"
}
}
</script>
</body>
</html>
HTML
describe '#title' do
it 'returns the title from structured data' do
expect(subject.title).to eq 'Foo'
end
end
describe '#description' do
it 'returns the description from structured data' do
expect(subject.description).to eq 'Bar'
end
end
describe '#provider_name' do
it 'returns the provider name from structured data' do
expect(subject.provider_name).to eq 'Baz'
end
end
describe '#author_name' do
it 'returns the author name from structured data' do
expect(subject.author_name).to eq 'Hoge'
end
end
end
end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment