Class String
In: lib/feedparser/text-output.rb
lib/feedparser/textconverters.rb
Parent: Object
dot/f_7.png

This class provides various converters

Methods

Constants

# Table of (pattern => replacement) pairs that String#unescape_html applies,
# one gsub per entry, in iteration order.
# NOTE(review): shown empty here; presumably populated elsewhere in
# textconverters.rb -- confirm before relying on its contents.
MY_ENTITIES = {}

Public Instance methods

[Source]

    # File lib/feedparser/textconverters.rb, line 17
# Escape the HTML-significant characters of the text.
#
# Returns a new String in which '&' is replaced by '&amp;', '<' by '&lt;'
# and '>' by '&gt;'. The receiver itself is not modified.
def escape_html
  # '&' has to be handled first: doing it later would re-escape the
  # ampersands introduced by the '&lt;' and '&gt;' replacements.
  r = self.gsub('&', '&amp;')
  r = r.gsub('<', '&lt;')
  r.gsub('>', '&gt;')
end

Returns a truthy value (the match offset) if the text contains escaped HTML (with HTML entities), and nil otherwise. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 13
# Tell whether the text looks like HTML whose markup was escaped with
# entities (an escaped <img src=, <a href=, <br> variant or <p>).
#
# The patterns are tried in the order listed; the result is the match
# offset of the first pattern that matches, or nil when none does.
def escaped_html?
  [
    /&lt;img src=/,
    /&lt;a href=/,
    /&lt;br(\/| \/|)&gt;/,
    /&lt;p&gt;/
  ].each do |marker|
    offset = (self =~ marker)
    return offset if offset
  end
  nil
end

Convert an HTML text to plain text

[Source]

    # File lib/feedparser/text-output.rb, line 7
 7:   def html2text
 8:     text = self.clone
 9:     # parse HTML
10:     p = FeedParser::HTML2TextParser::new(true)
11:     p.feed(text)
12:     p.close
13:     text = p.savedata
14:     # remove leading and trailing whilespace
15:     text.gsub!(/\A\s*/m, '')
16:     text.gsub!(/\s*\Z/m, '')
17:     # remove whitespace around \n
18:     text.gsub!(/ *\n/m, "\n")
19:     text.gsub!(/\n */m, "\n")
20:     # and duplicates \n
21:     text.gsub!(/\n\n+/m, "\n\n")
22:     # and remove duplicated whitespace
23:     text.gsub!(/[ \t]+/, ' ')
24:     text
25:   end

Is this text HTML? Searches for a few common tags and returns a truthy value (the match offset) if one is found. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 8
 8:   def html?
 9:     return (self =~ /<p>/) || (self =~ /<\/p>/) || (self =~ /<br>/) || (self =~ /<br\s*(\/)?\s*>/) || (self =~ /<\/a>/) || (self =~ /<img.*>/)
10:   end

Remove white space around the text

[Source]

    # File lib/feedparser/textconverters.rb, line 83
# Strip leading and trailing whitespace from the receiver, in place.
#
# Both patterns can match the empty string, so each gsub! finds a match
# and hands back the receiver rather than nil.
def rmWhiteSpace!
  head_trimmed = self.gsub!(/\A\s*/m, '')
  head_trimmed.gsub!(/\s*\Z/m,'')
end

Convert text to HTML. Relative src/href URLs are made absolute using the link of the given feed.

[Source]

    # File lib/feedparser/textconverters.rb, line 39
# Convert the text to HTML and make its src/href URLs absolute.
#
# feed - object responding to #link (the URL used as base for relative
#        URLs), or nil to skip the URL fixing step.
#
# The text is handled according to what it looks like:
# * already HTML (html?): kept as is;
# * entity-escaped HTML (escaped_html?): un-escaped with unescape_html;
# * anything else: wrapped in <p> paragraphs (blank lines separate
#   paragraphs) and http/ftp/https URIs are turned into links.
#
# Returns a new String; the receiver is not modified.
def text2html(feed)
  text = self.clone
  if text.html?
    # already HTML: nothing to convert
  elsif text.escaped_html?
    text = text.unescape_html
  else
    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    # NOTE(review): URI::regexp is flagged obsolete by recent Ruby
    # versions; kept unchanged to stay compatible with older ones.
    text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/,
        '<a href="\1">\1</a>')
  end
  # Handle broken (relative) hrefs in <a> and <img>
  if feed && feed.link
    text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
      begin
        # Grab the captures before any further match overwrites them.
        first, url, last = $1, $3, $4
        # "http://host/a/b" => ["http:", "", "host", "a", "b"]
        parts = feed.link.split(/\//)
        if (url =~ /^\s*\w+:\/\//) || (url =~ /^\s*\w+:\w/)
          # already absolute (scheme://...) or scheme:something (mailto:...)
          m
        elsif url =~ /\A\/\//
          # protocol-relative URL (//host/path): only the scheme is missing.
          # Left untouched when the feed link carries no usable scheme.
          parts[0] =~ /\A\w+:\z/ ? (first + parts[0] + url + last) : m
        elsif url =~ /^\//
          # root-relative: prepend scheme://host
          first + parts[0..2].join('/') + url + last
        elsif parts.length == 3
          # feed link is a bare site root, with or without a trailing '/'
          # (split drops the trailing empty field): join with a single '/'
          first + parts.join('/') + '/' + url + last
        elsif feed.link =~ /\/$/
          # feed link is a directory
          first + feed.link + url + last
        else
          # feed link is a document: resolve against its directory
          first + parts[0...-1].join('/') + '/' + url + last
        end
      rescue
        # best effort: leave the attribute untouched on any error
        m
      end
    end
  end
  text
end

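Convert a text from the inputenc encoding to UTF-8. Must take care of wrongly declared input encodings: a text that is declared as something else but is in fact already valid UTF-8 is returned unchanged.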

[Source]

     # File lib/feedparser/textconverters.rb, line 89
 89:   def toUTF8(inputenc)
 90:     if inputenc.downcase != 'utf-8'
 91:       # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
 92:       begin
 93:         if self.unpack('U*').pack('U*') == self
 94:           return self
 95:         end
 96:       rescue
 97:         # do nothing
 98:       end
 99:       begin
100:         return self.unpack('C*').pack('U*')
101:       rescue
102:         return self #failsafe solution. but a dirty one :-)
103:       end
104:     else
105:       return self
106:     end
107:   end

Un-escape the HTML entities in the text. Used by String#text2html.

[Source]

    # File lib/feedparser/textconverters.rb, line 30
# Un-escape the text by applying every MY_ENTITIES substitution.
#
# Each (pattern, replacement) pair of MY_ENTITIES is applied with gsub, in
# the table's iteration order, to the result of the previous one. Returns
# the receiver itself when the table is empty.
def unescape_html
  MY_ENTITIES.inject(self) do |unescaped, (entity, replacement)|
    unescaped.gsub(entity, replacement)
  end
end

[Validate]