[fix] Encoding · malina/metascraper@b4b0470 · GitHub
[go: up one dir, main page]

Skip to content

Commit

Permalink
[fix] Encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
malina committed Mar 6, 2017
1 parent 824b393 commit b4b0470
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 19 deletions.
33 changes: 22 additions & 11 deletions src/metascraper/parser.cr
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ module Metascraper
@url = url
@response = get_request(url).as(HTTP::Client::Response)

response_body = encode_body
response_body = @response.body

@document = XML.parse_html(response_body)
@texts = Parsers::Text.new(@document)
get_charset()
@texts = Parsers::Text.new(@document, config)
@images = Parsers::Images.new(@document, config)
@videos = config.skip_video ? Videos.new : Parsers::Videos.new(@document, config)
end
Expand All @@ -35,18 +36,28 @@ module Metascraper
response
end

private def encode_body : String
charset = @response.charset.as(String) rescue Metascraper::Config::DEFAULT_CHARSET
private def get_charset : Void
from_response_charset = @response.charset.as(String)

if charset == config.charset
@response.body
unless charset_from_html.empty?
config.charset = charset_from_html
else
config.charset = charset
Utils.new(
@response.body,
charset
).encodeToUtf8.as(String)
config.charset = from_response_charset
end
end

# Returns the charset name declared in the document's `<meta>` tags, or an
# empty string when none is declared.
#
# Two declaration forms are recognised:
# * legacy: `<meta http-equiv="Content-Type" content="text/html; charset=...">`
# * HTML5:  `<meta charset="...">`
#
# Only the charset token itself is returned: surrounding quotes, whitespace
# and any trailing `; ...` parameters are dropped. Matching on `charset=` is
# case-insensitive.
#
# Never raises: any error while querying the document yields `""`, which the
# caller treats as "no charset declared in the HTML".
private def charset_from_html : String
  # Scan every `content` attribute rather than only the first one mentioning
  # "charset", so an unrelated tag (e.g. a description containing the word)
  # cannot hide the real declaration.
  @document.xpath_nodes("//meta/@content").each do |attribute|
    if declared = attribute.content.match(/charset\s*=\s*["']?([^\s;"'>]+)/i)
      return declared[1]
    end
  end

  # Fall back to the HTML5 short form.
  html5 = @document.xpath_node("//meta[@charset]/@charset")
  html5 ? html5.content.strip : ""
rescue
  # Best effort: a failed lookup is reported the same way as "not declared".
  ""
end
end
end
22 changes: 14 additions & 8 deletions src/metascraper/parsers/text_parser.cr
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
module Metascraper
module Parsers
class Text
getter document
def initialize(@document : XML::Node)
getter document, config
def initialize(@document : XML::Node, @config : Config)
end

def title
document_title || og_title
encode(document_title || og_title)
rescue
nil
end

def description
meta_descriptions = document.xpath_nodes("//meta[@name='description']")
unless meta_descriptions.empty?
meta_descriptions.first.attributes["content"].text.strip.chomp
else
secondary_description
end
description = unless meta_descriptions.empty?
meta_descriptions.first.attributes["content"].text.strip.chomp
else
secondary_description
end
encode(description)
rescue
nil
end
Expand All @@ -40,6 +41,11 @@ module Metascraper
rescue
nil
end

# Passes *text* together with `config.charset` to `Utils` and returns the
# result of `Utils#encodeToUtf8` as a `String`.
#
# A `nil` *text* is returned as `nil` without touching `Utils`.
def encode(text : String | Nil) : String | Nil
  text.try do |raw|
    Utils.new(raw, config.charset).encodeToUtf8.as(String)
  end
end
end
end
end

0 comments on commit b4b0470

Please sign in to comment.
0