class String
Some useful additions to the String class.
Copyright © 2010-16, Jacques Distler. All rights reserved. Licensed under a triple GPL/MPL/LGPL License.
Constants
- UTF8_REGEX
Public Instance Methods
Source
# File lib/itex_stringsupport.rb, line 34
# Returns the string's bytes tagged with the ASCII-8BIT (binary) encoding.
#
# A mutable receiver is re-tagged in place and returned itself, so existing
# callers keep getting the same object back. A frozen receiver cannot be
# re-tagged (force_encoding raises on it), so a binary-tagged copy is
# returned instead and the receiver is left untouched.
def as_bytes
  target = frozen? ? dup : self
  target.force_encoding("ASCII-8BIT")
end
Source
# File lib/itex_stringsupport.rb, line 81
# Returns a copy of the string in which every numeric character reference
# (hexadecimal "&#x...;" / "&#X...;" first, then decimal "&#...;") is kept
# only if its code point, packed with 'U*' and viewed as bytes, matches
# UTF8_REGEX. References that fail the check are removed.
#
# Array#pack raises RangeError for code points it cannot encode (for
# example "&#xFFFFFFFFFF;"); such references are treated as invalid and
# removed as well, instead of aborting the whole call.
def check_ncrs
  keep_if_valid = lambda do |ncr, codepoint|
    begin
      [codepoint].pack('U*').as_bytes =~ UTF8_REGEX ? ncr : ''
    rescue RangeError
      # The number is too large to be packed as a character at all.
      ''
    end
  end

  text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| keep_if_valid.call(m, $1.hex) }
  text.gsub(/&#(\d+);/) { |m| keep_if_valid.call(m, $1.to_i) }
end
Source
# File lib/itex_stringsupport.rb, line 107
# Reports whether the string, after filtering by check_ncrs, matches
# UTF8_REGEX when viewed as raw bytes.
#
# Returns true or false (never the match offset that =~ produces).
def is_utf8?
  # check_ncrs hands back a filtered copy; that copy is the one re-tagged
  # as bytes.
  text = check_ncrs.as_bytes

  # A split/join based treatment of the NCRs was tried here in the past and
  # turned out not to be faster than the gsub-based check_ncrs.

  # Make sure the resulting string of bytes is valid utf-8.
  (text =~ UTF8_REGEX) ? true : false
end
Checks whether the string is valid UTF-8.
Returns a truthy value if the sequence of bytes in the string is valid UTF-8, and a falsy value otherwise.
Source
# File lib/itex_stringsupport.rb, line 70
# Returns a cleaned-up copy of the string: a duplicate is passed through
# check_ncrs and as_utf8, then only the characters whose bytes match
# UTF8_REGEX are kept and joined back together.
# NOTE(review): as_utf8 is defined outside this excerpt; presumably it
# re-tags the string as UTF-8 - confirm.
def purify
  cleaned = dup.check_ncrs.as_utf8
  valid_chars = cleaned.chars.map { |char| char.as_bytes }.grep(UTF8_REGEX)
  valid_chars.join.as_utf8
end
Source
# File lib/itex_stringsupport.rb, line 2263
# Returns a new string in which each "&name;" token (name made of ASCII
# letters and digits) is replaced by the result of calling convert_to_ncr
# on the name. The receiver is not modified.
def to_ncr
  gsub(/&([a-zA-Z0-9]+);/) do |_entity|
    Regexp.last_match(1).convert_to_ncr
  end
end
Converts XHTML+MathML named entities in string to Numeric Character References
Source
# File lib/itex_stringsupport.rb, line 2274
# In-place variant of to_ncr: each "&name;" token in the receiver is
# replaced by the result of calling convert_to_ncr on the name.
# Like gsub!, returns nil when no substitution was performed.
def to_ncr!
  gsub!(/&([a-zA-Z0-9]+);/) do |_entity|
    Regexp.last_match(1).convert_to_ncr
  end
end
Converts XHTML+MathML named entities in string to Numeric Character References
Substitution is done in-place.
Source
# File lib/itex_stringsupport.rb, line 2284
# Returns a new string in which each "&name;" token (name made of ASCII
# letters and digits) is replaced by the result of calling convert_to_utf8
# on the name. The receiver is not modified.
#
# A variant that split the string on the entity pattern, converted the
# odd-numbered pieces and joined everything back was tried and was not
# faster than this gsub.
def to_utf8
  gsub(/&([a-zA-Z0-9]+);/) do |_entity|
    Regexp.last_match(1).convert_to_utf8
  end
end
Converts XHTML+MathML named entities in string to UTF-8
Source
# File lib/itex_stringsupport.rb, line 2301
# In-place variant of to_utf8: each "&name;" token in the receiver is
# replaced by the result of calling convert_to_utf8 on the name.
# Like gsub!, returns nil when no substitution was performed.
def to_utf8!
  gsub!(/&([a-zA-Z0-9]+);/) do |_entity|
    Regexp.last_match(1).convert_to_utf8
  end
end
Converts XHTML+MathML named entities in string to UTF-8.
Substitution is done in-place.