Class | String |
In: |
lib/feedparser/textconverters.rb
lib/feedparser/text-output.rb |
Parent: | Object |
This class provides various converters
MY_ENTITIES | = | {} |
# File lib/feedparser/textconverters.rb, line 17
#
# Escape the HTML special characters of the text.
#
# "&", "<" and ">" are replaced by the entities "&amp;", "&lt;" and "&gt;".
# The substitution is done in a single pass, so the "&" of an entity that was
# just inserted is never escaped a second time.
#
# Returns a new String; the receiver is not modified.
def escape_html
  gsub(/[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;')
end
Returns a truthy value if the text contains escaped HTML (i.e. HTML tags written with HTML entities). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
#
# Tell whether the text contains entity-escaped HTML, i.e. tags whose angle
# brackets were written as "&lt;" and "&gt;".
#
# Looks for an escaped <img src=, <a href=, <br> (also <br/> and <br />)
# or <p>.
#
# Returns the index of the first pattern that matches (truthy), or nil when
# none of them matches.
def escaped_html?
  (self =~ /&lt;img src=/) || (self =~ /&lt;a href=/) ||
    (self =~ /&lt;br(\/| \/|)&gt;/) || (self =~ /&lt;p&gt;/)
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 7
#
# Convert an HTML text to plain text.
#
# A copy of the receiver is fed to a FeedParser::HTML2TextParser; the data
# the parser saved is then tidied up and returned. The receiver itself is
# not modified.
def html2text
  # parse the HTML (the meaning of the +true+ flag is defined by the parser)
  parser = FeedParser::HTML2TextParser::new(true)
  parser.feed(self.clone)
  parser.close
  plain = parser.savedata

  # whitespace clean-up, applied in this order
  [
    [/\A\s*/m, ''],        # leading whitespace
    [/\s*\Z/m, ''],        # trailing whitespace
    [/ *\n/m, "\n"],       # spaces before a newline
    [/\n */m, "\n"],       # spaces after a newline
    [/\n\n+/m, "\n\n"]     # runs of blank lines collapse to a single one
  ].each { |pattern, replacement| plain.gsub!(pattern, replacement) }

  plain
end
Is this text HTML? Searches for common tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
#
# Tell whether the text looks like HTML, by searching for a few common tags:
# <p>, </p>, <br> (with optional whitespace and slash), </a> and <img ...>.
#
# Tag names are matched case-insensitively, so markup written as <BR> or
# <IMG SRC="..."> is recognised as well.
#
# Returns the index of the first pattern that matches (truthy), or nil when
# no tag is found.
def html?
  (self =~ /<p>/i) || (self =~ /<\/p>/i) || (self =~ /<br\s*(\/)?\s*>/i) ||
    (self =~ /<\/a>/i) || (self =~ /<img.*>/i)
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 83
#
# Remove the white space at the beginning and at the end of the text,
# in place.
#
# The two substitutions are deliberately not chained: gsub! returns nil when
# it performs no substitution, so the result is returned explicitly instead
# of relying on the return value of the first call.
#
# Returns the receiver.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m, '')
  self
end
convert text to HTML
# File lib/feedparser/textconverters.rb, line 39
#
# Convert the text to HTML.
#
# * Text that html? recognises as HTML is kept as it is.
# * Text that escaped_html? recognises as escaped HTML is un-escaped.
# * Anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank lines start a new paragraph and http/ftp/https URIs become links.
#
# When +feed+ has a link, relative +src+ and +href+ attributes are then made
# absolute against it.
#
# feed:: object responding to +link+ (the base URL, expected to be a String),
#        or nil to skip the relative-link fix-up.
#
# Returns a new String; the receiver is not modified.
def text2html(feed)
  text = self.clone
  if text.html?
    # already HTML: nothing to convert
  elsif text.escaped_html?
    text = text.unescape_html
  else
    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/,
               '<a href="\1">\1</a>')
  end
  return text unless feed && feed.link

  # Handle broken (relative) hrefs in <a> and <img>
  text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
    begin
      # grab the captures first: the matches below overwrite $1..$4
      first, url, last = $1, $3, $4
      if (url =~ /^\s*\w+:\/\//) || (url =~ /^\s*\w+:\w/)
        # already absolute (scheme://...) or scheme-only (mailto:x): keep
        m
      else
        base = feed.link
        parts = base.split(/\//)
        if url =~ /^\/\//
          # protocol-relative (//host/path): only the scheme is missing
          scheme = base[/\A\w+:/]
          scheme ? (first + scheme + url + last) : m
        elsif url =~ /^\//
          # root-relative: prepend scheme://host
          first + parts[0..2].join('/') + url + last
        elsif base =~ /\/$/
          # base is a directory. Checked before the bare-host case because
          # "http://toto/" also splits into three parts.
          first + base + url + last
        elsif parts.length == 3
          # http://toto with no trailing /
          first + base + '/' + url + last
        else
          # base points at a document: resolve against its directory
          first + parts[0...-1].join('/') + '/' + url + last
        end
      end
    rescue
      # best effort: leave the attribute untouched if anything goes wrong
      m
    end
  end
  text
end
Convert a text in the inputenc encoding to a text in UTF-8. Must take care of wrongly declared input encodings.
# File lib/feedparser/textconverters.rb, line 89
#
# Convert a text announced as being in +inputenc+ to UTF-8, without trusting
# the announcement blindly.
#
# * If +inputenc+ is "utf-8" (any case), the receiver is returned as is.
# * Otherwise, if the bytes of the receiver already form valid UTF-8, the
#   announced encoding is considered wrong and the receiver is returned
#   as is. The check is done on the bytes, so it does not depend on the
#   encoding the String object happens to be tagged with.
# * Otherwise every byte is taken as one code point (which is what
#   ISO-8859-1 does) and the result is re-encoded as UTF-8.
#
# inputenc:: name of the announced encoding; nil is treated as "not UTF-8".
#
# Returns the receiver itself or a new UTF-8 String.
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
  return self if dup.force_encoding(Encoding::UTF_8).valid_encoding?

  begin
    unpack('C*').pack('U*')
  rescue StandardError
    self # failsafe solution. but a dirty one :-)
  end
end
Un-escape HTML in the text. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 30
#
# Un-escape the HTML contained in the text.
#
# Every pair of MY_ENTITIES is applied in turn, in the iteration order of
# the table: each occurrence of the key is replaced by its value, and the
# next pair works on the result of the previous one.
#
# Returns the converted text; the receiver is not modified (it is returned
# itself when MY_ENTITIES is empty).
def unescape_html
  MY_ENTITIES.inject(self) do |converted, (entity, replacement)|
    converted.gsub(entity, replacement)
  end
end