001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.util.Arrays; 024import java.util.Collections; 025import java.util.Comparator; 026import java.util.List; 027 028import org.apache.commons.io.ByteOrderMark; 029 030/** 031 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 032 * 033 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the 034 * first byte in the stream. 035 * 036 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 037 * <ul> 038 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 039 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 040 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 041 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 042 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 043 * </ul> 044 * 045 * 046 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3> 047 * 048 * <pre> 049 * BOMInputStream bomIn = new BOMInputStream(in); 050 * if (bomIn.hasBOM()) { 051 * // has a UTF-8 BOM 052 * } 053 * </pre> 054 * 055 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3> 056 * 057 * <pre> 058 * boolean include = true; 059 * BOMInputStream bomIn = new BOMInputStream(in, include); 060 * if (bomIn.hasBOM()) { 061 * // has a UTF-8 BOM 062 * } 063 * </pre> 064 * 065 * <h3>Example 3 - Detect Multiple BOMs</h3> 066 * 067 * <pre> 068 * BOMInputStream bomIn = new BOMInputStream(in, 069 * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, 070 * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE 071 * ); 072 * if (bomIn.hasBOM() == false) { 073 * // No BOM found 074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 075 * // has a UTF-16LE BOM 076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 077 * // has a UTF-16BE BOM 078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 079 * // has a UTF-32LE BOM 080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 081 * // has a UTF-32BE BOM 082 * } 083 * </pre> 084 * 085 * @see org.apache.commons.io.ByteOrderMark 086 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 087 * @version $Id$ 088 * @since 2.0 089 */ 090public class BOMInputStream extends ProxyInputStream { 091 private final boolean include; 092 /** 093 * BOMs are sorted from longest to shortest. 094 */ 095 private final List<ByteOrderMark> boms; 096 private ByteOrderMark byteOrderMark; 097 private int[] firstBytes; 098 private int fbLength; 099 private int fbIndex; 100 private int markFbIndex; 101 private boolean markedAtStart; 102 103 /** 104 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 105 * 106 * @param delegate 107 * the InputStream to delegate to 108 */ 109 public BOMInputStream(final InputStream delegate) { 110 this(delegate, false, ByteOrderMark.UTF_8); 111 } 112 113 /** 114 * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it. 115 * 116 * @param delegate 117 * the InputStream to delegate to 118 * @param include 119 * true to include the UTF-8 BOM or false to exclude it 120 */ 121 public BOMInputStream(final InputStream delegate, final boolean include) { 122 this(delegate, include, ByteOrderMark.UTF_8); 123 } 124 125 /** 126 * Constructs a new BOM InputStream that excludes the specified BOMs. 127 * 128 * @param delegate 129 * the InputStream to delegate to 130 * @param boms 131 * The BOMs to detect and exclude 132 */ 133 public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { 134 this(delegate, false, boms); 135 } 136 137 /** 138 * Compares ByteOrderMark objects in descending length order. 139 */ 140 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() { 141 142 @Override 143 public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) { 144 final int len1 = bom1.length(); 145 final int len2 = bom2.length(); 146 if (len1 > len2) { 147 return EOF; 148 } 149 if (len2 > len1) { 150 return 1; 151 } 152 return 0; 153 } 154 }; 155 156 /** 157 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 158 * 159 * @param delegate 160 * the InputStream to delegate to 161 * @param include 162 * true to include the specified BOMs or false to exclude them 163 * @param boms 164 * The BOMs to detect and optionally exclude 165 */ 166 public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) { 167 super(delegate); 168 if (boms == null || boms.length == 0) { 169 throw new IllegalArgumentException("No BOMs specified"); 170 } 171 this.include = include; 172 final List<ByteOrderMark> list = Arrays.asList(boms); 173 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 174 Collections.sort(list, ByteOrderMarkLengthComparator); 175 this.boms = list; 176 177 } 178 179 /** 180 * Indicates whether the stream contains one of the specified BOMs. 181 * 182 * @return true if the stream has one of the specified BOMs, otherwise false if it does not 183 * @throws IOException 184 * if an error reading the first bytes of the stream occurs 185 */ 186 public boolean hasBOM() throws IOException { 187 return getBOM() != null; 188 } 189 190 /** 191 * Indicates whether the stream contains the specified BOM. 192 * 193 * @param bom 194 * The BOM to check for 195 * @return true if the stream has the specified BOM, otherwise false if it does not 196 * @throws IllegalArgumentException 197 * if the BOM is not one the stream is configured to detect 198 * @throws IOException 199 * if an error reading the first bytes of the stream occurs 200 */ 201 public boolean hasBOM(final ByteOrderMark bom) throws IOException { 202 if (!boms.contains(bom)) { 203 throw new IllegalArgumentException("Stream not configure to detect " + bom); 204 } 205 getBOM(); 206 return byteOrderMark != null && byteOrderMark.equals(bom); 207 } 208 209 /** 210 * Return the BOM (Byte Order Mark). 211 * 212 * @return The BOM or null if none 213 * @throws IOException 214 * if an error reading the first bytes of the stream occurs 215 */ 216 public ByteOrderMark getBOM() throws IOException { 217 if (firstBytes == null) { 218 fbLength = 0; 219 // BOMs are sorted from longest to shortest 220 final int maxBomSize = boms.get(0).length(); 221 firstBytes = new int[maxBomSize]; 222 // Read first maxBomSize bytes 223 for (int i = 0; i < firstBytes.length; i++) { 224 firstBytes[i] = in.read(); 225 fbLength++; 226 if (firstBytes[i] < 0) { 227 break; 228 } 229 } 230 // match BOM in firstBytes 231 byteOrderMark = find(); 232 if (byteOrderMark != null) { 233 if (!include) { 234 if (byteOrderMark.length() < firstBytes.length) { 235 fbIndex = byteOrderMark.length(); 236 } else { 237 fbLength = 0; 238 } 239 } 240 } 241 } 242 return byteOrderMark; 243 } 244 245 /** 246 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 247 * 248 * @return The BOM charset Name or null if no BOM found 249 * @throws IOException 250 * if an error reading the first bytes of the stream occurs 251 * 252 */ 253 public String getBOMCharsetName() throws IOException { 254 getBOM(); 255 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 256 } 257 258 /** 259 * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte 260 * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been 261 * processed already. 262 * 263 * @return the byte read (excluding BOM) or -1 if the end of stream 264 * @throws IOException 265 * if an I/O error occurs 266 */ 267 private int readFirstBytes() throws IOException { 268 getBOM(); 269 return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF; 270 } 271 272 /** 273 * Find a BOM with the specified bytes. 274 * 275 * @return The matched BOM or null if none matched 276 */ 277 private ByteOrderMark find() { 278 for (final ByteOrderMark bom : boms) { 279 if (matches(bom)) { 280 return bom; 281 } 282 } 283 return null; 284 } 285 286 /** 287 * Check if the bytes match a BOM. 288 * 289 * @param bom 290 * The BOM 291 * @return true if the bytes match the bom, otherwise false 292 */ 293 private boolean matches(final ByteOrderMark bom) { 294 // if (bom.length() != fbLength) { 295 // return false; 296 // } 297 // firstBytes may be bigger than the BOM bytes 298 for (int i = 0; i < bom.length(); i++) { 299 if (bom.get(i) != firstBytes[i]) { 300 return false; 301 } 302 } 303 return true; 304 } 305 306 // ---------------------------------------------------------------------------- 307 // Implementation of InputStream 308 // ---------------------------------------------------------------------------- 309 310 /** 311 * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM. 312 * 313 * @return the byte read (excluding BOM) or -1 if the end of stream 314 * @throws IOException 315 * if an I/O error occurs 316 */ 317 @Override 318 public int read() throws IOException { 319 final int b = readFirstBytes(); 320 return b >= 0 ? b : in.read(); 321 } 322 323 /** 324 * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM. 325 * 326 * @param buf 327 * the buffer to read the bytes into 328 * @param off 329 * The start offset 330 * @param len 331 * The number of bytes to read (excluding BOM) 332 * @return the number of bytes read or -1 if the end of stream 333 * @throws IOException 334 * if an I/O error occurs 335 */ 336 @Override 337 public int read(final byte[] buf, int off, int len) throws IOException { 338 int firstCount = 0; 339 int b = 0; 340 while (len > 0 && b >= 0) { 341 b = readFirstBytes(); 342 if (b >= 0) { 343 buf[off++] = (byte) (b & 0xFF); 344 len--; 345 firstCount++; 346 } 347 } 348 final int secondCount = in.read(buf, off, len); 349 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount; 350 } 351 352 /** 353 * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM. 354 * 355 * @param buf 356 * the buffer to read the bytes into 357 * @return the number of bytes read (excluding BOM) or -1 if the end of stream 358 * @throws IOException 359 * if an I/O error occurs 360 */ 361 @Override 362 public int read(final byte[] buf) throws IOException { 363 return read(buf, 0, buf.length); 364 } 365 366 /** 367 * Invokes the delegate's <code>mark(int)</code> method. 368 * 369 * @param readlimit 370 * read ahead limit 371 */ 372 @Override 373 public synchronized void mark(final int readlimit) { 374 markFbIndex = fbIndex; 375 markedAtStart = firstBytes == null; 376 in.mark(readlimit); 377 } 378 379 /** 380 * Invokes the delegate's <code>reset()</code> method. 381 * 382 * @throws IOException 383 * if an I/O error occurs 384 */ 385 @Override 386 public synchronized void reset() throws IOException { 387 fbIndex = markFbIndex; 388 if (markedAtStart) { 389 firstBytes = null; 390 } 391 392 in.reset(); 393 } 394 395 /** 396 * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM. 397 * 398 * @param n 399 * the number of bytes to skip 400 * @return the number of bytes to skipped or -1 if the end of stream 401 * @throws IOException 402 * if an I/O error occurs 403 */ 404 @Override 405 public long skip(final long n) throws IOException { 406 int skipped = 0; 407 while ((n > skipped) && (readFirstBytes() >= 0)) { 408 skipped++; 409 } 410 return in.skip(n - skipped) + skipped; 411 } 412}