diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index acc53ca885..f4c21f9474 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -3,8 +3,8 @@ * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -14,50 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.config; -import static java.nio.charset.StandardCharsets.UTF_8; +package org.apache.tika.config; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; +import java.util.*; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.utils.ServiceLoaderUtils; -/** - * Internal utility class that Tika uses to look up service providers. - * - * @since Apache Tika 0.9 - */ public class ServiceLoader { - - /** - * The dynamic set of services available in an OSGi environment. - * Managed by the {@link TikaActivator} class and used as an additional - * source of service instances in the {@link #loadServiceProviders(Class)} - * method. - */ private static final Map SERVICES = new HashMap<>(); - private static final Pattern COMMENT = Pattern.compile("#.*"); - private static final Pattern WHITESPACE = Pattern.compile("\\s+"); - /** - * The default context class loader to use for all threads, or - * null to automatically select the context class loader. - */ - private static volatile ClassLoader CONTEXT_CLASS_LOADER = null; + private static final ClassLoader CONTEXT_CLASS_LOADER = null; private final ClassLoader loader; private final LoadErrorHandler handler; private final InitializableProblemHandler initializableProblemHandler; @@ -69,8 +39,8 @@ public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, this.handler = handler; this.initializableProblemHandler = initializableProblemHandler; this.dynamic = dynamic; - } + public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, boolean dynamic) { this(loader, handler, InitializableProblemHandler.WARN, dynamic); } @@ -91,125 +61,46 @@ public ServiceLoader() { LoadErrorHandler.IGNORE, true); } - /** - * Returns the context class loader of the current thread. If such - * a class loader is not available, then the loader of this class or - * finally the system class loader is returned. - * - * @return context class loader, or null if no loader - * is available - * @see TIKA-441 - */ - static ClassLoader getContextClassLoader() { + public static ClassLoader getContextClassLoader() { ClassLoader loader = CONTEXT_CLASS_LOADER; - if (loader == null) { - loader = ServiceLoader.class.getClassLoader(); - } - if (loader == null) { - loader = ClassLoader.getSystemClassLoader(); - } + if (loader == null) loader = ServiceLoader.class.getClassLoader(); + if (loader == null) loader = ClassLoader.getSystemClassLoader(); return loader; } - /** - * Sets the context class loader to use for all threads that access - * this class. Used for example in an OSGi environment to avoid problems - * with the default context class loader. - * - * @param loader default context class loader, - * or null to automatically pick the loader - */ - public static void setContextClassLoader(ClassLoader loader) { - CONTEXT_CLASS_LOADER = loader; - } - static void addService(Object reference, Object service, int rank) { synchronized (SERVICES) { SERVICES.put(reference, new RankedService(service, rank)); } } - static Object removeService(Object reference) { + static void removeService(Object reference) { synchronized (SERVICES) { - return SERVICES.remove(reference); + SERVICES.remove(reference); } } - /** - * Returns if the service loader is static or dynamic - * - * @return dynamic or static loading - * @since Apache Tika 1.10 - */ public boolean isDynamic() { return dynamic; } - /** - * Returns the load error handler used by this loader. - * - * @return load error handler - * @since Apache Tika 1.3 - */ public LoadErrorHandler getLoadErrorHandler() { return handler; } - /** - * Returns the handler for problems with initializables - * - * @return handler for problems with initializables - * @since Apache Tika 1.15.1 - */ public InitializableProblemHandler getInitializableProblemHandler() { return initializableProblemHandler; } - /** - * Returns an input stream for reading the specified resource from the - * configured class loader. - * - * @param name resource name - * @return input stream, or null if the resource was not found - * @see ClassLoader#getResourceAsStream(String) - * @since Apache Tika 1.1 - */ public InputStream getResourceAsStream(String name) { - if (loader != null) { - return loader.getResourceAsStream(name); - } else { - return null; - } + return loader != null ? loader.getResourceAsStream(name) : null; } - /** - * @return ClassLoader used by this ServiceLoader - * @see #getContextClassLoader() for the context's ClassLoader - * @since Apache Tika 1.15.1 - */ public ClassLoader getLoader() { return loader; } - /** - * Loads and returns the named service class that's expected to implement - * the given interface. - *

- * Note that this class does not use the {@link LoadErrorHandler}, a - * {@link ClassNotFoundException} is always returned for unknown - * classes or classes of the wrong type - * - * @param iface service interface - * @param name service class name - * @return service class - * @throws ClassNotFoundException if the service class can not be found - * or does not implement the given interface - * @see Class#forName(String, boolean, ClassLoader) - * @since Apache Tika 1.1 - */ - @SuppressWarnings("unchecked") - public Class getServiceClass(Class iface, String name) - throws ClassNotFoundException { + public Class getServiceClass(Class iface, String name) throws ClassNotFoundException { if (loader == null) { throw new ClassNotFoundException("Service class " + name + " is not available"); } @@ -217,37 +108,20 @@ public Class getServiceClass(Class iface, String name) if (klass.isInterface()) { throw new ClassNotFoundException("Service class " + name + " is an interface"); } else if (!iface.isAssignableFrom(klass)) { - throw new ClassNotFoundException( - "Service class " + name + " does not implement " + iface.getName()); + throw new ClassNotFoundException("Service class " + name + " does not implement " + iface.getName()); } else { return (Class) klass; } } - /** - * Returns all the available service resources matching the - * given pattern, such as all instances of tika-mimetypes.xml - * on the classpath, or all org.apache.tika.parser.Parser - * service files. - */ public Enumeration findServiceResources(String filePattern) { try { return loader.getResources(filePattern); } catch (IOException ignore) { - // We couldn't get the list of service resource files - List empty = Collections.emptyList(); - return Collections.enumeration(empty); + return Collections.enumeration(Collections.emptyList()); } } - /** - * Returns all the available service providers of the given type. - * - * As of versions after 2.4.1, this removes duplicate classes - * - * @param iface service provider interface - * @return available service providers - */ public List loadServiceProviders(Class iface) { List tmp = new ArrayList<>(); tmp.addAll(loadDynamicServiceProviders(iface)); @@ -256,7 +130,7 @@ public List loadServiceProviders(Class iface) { List providers = new ArrayList<>(); Set seen = new HashSet<>(); for (T provider : tmp) { - if (! seen.contains(provider.getClass().getCanonicalName())) { + if (!seen.contains(provider.getClass().getCanonicalName())) { providers.add(provider); seen.add(provider.getClass().getCanonicalName()); } @@ -264,15 +138,6 @@ public List loadServiceProviders(Class iface) { return providers; } - /** - * Returns the available dynamic service providers of the given type. - * The returned list is newly allocated and may be freely modified - * by the caller. - * - * @param iface service provider interface - * @return dynamic service providers - * @since Apache Tika 1.2 - */ @SuppressWarnings("unchecked") public List loadDynamicServiceProviders(Class iface) { if (dynamic) { @@ -293,16 +158,6 @@ public List loadDynamicServiceProviders(Class iface) { } } - /** - * Returns the defined static service providers of the given type, without - * attempting to load them. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). - * - * @param iface service provider interface - * @return static list of uninitialised service providers - * @since Apache Tika 1.6 - */ protected List identifyStaticServiceProviders(Class iface) { List names = new ArrayList<>(); @@ -311,7 +166,7 @@ protected List identifyStaticServiceProviders(Class iface) { Enumeration resources = findServiceResources("META-INF/services/" + serviceName); for (URL resource : Collections.list(resources)) { try { - collectServiceClassNames(resource, names); + ServiceResourceUtils.collectServiceClassNames(resource, names); } catch (IOException e) { handler.handleLoadError(serviceName, e); } @@ -321,24 +176,12 @@ protected List identifyStaticServiceProviders(Class iface) { return names; } - public List loadStaticServiceProviders(Class iface) { + public List loadStaticServiceProviders(Class iface) { return loadStaticServiceProviders(iface, Collections.EMPTY_SET); } - /** - * Returns the available static service providers of the given type. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). The returned list is newly - * allocated and may be freely modified by the caller. - * - * @param iface service provider interface - * @param excludes -- do not load these classes - * @return static service providers - * @since Apache Tika 1.2 - */ @SuppressWarnings("unchecked") - public List loadStaticServiceProviders(Class iface, - Collection> excludes) { + public List loadStaticServiceProviders(Class iface, Collection> excludes) { List providers = new ArrayList<>(); if (loader != null) { @@ -358,14 +201,12 @@ public List loadStaticServiceProviders(Class iface, T instance = ServiceLoaderUtils.newInstance(klass, this); if (instance instanceof Initializable) { ((Initializable) instance).initialize(Collections.EMPTY_MAP); - ((Initializable) instance) - .checkInitialization(initializableProblemHandler); + ((Initializable) instance).checkInitialization(initializableProblemHandler); } providers.add(instance); } } else { - throw new TikaConfigException( - "Class " + name + " is not of type: " + iface); + throw new TikaConfigException("Class " + name + " is not of type: " + iface); } } catch (Throwable t) { handler.handleLoadError(name, t); @@ -374,40 +215,4 @@ public List loadStaticServiceProviders(Class iface, } return providers; } - - private void collectServiceClassNames(URL resource, Collection names) - throws IOException { - try (InputStream stream = resource.openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) { - String line = reader.readLine(); - while (line != null) { - line = COMMENT.matcher(line).replaceFirst(""); - line = WHITESPACE.matcher(line).replaceAll(""); - if (line.length() > 0) { - names.add(line); - } - line = reader.readLine(); - } - } - } - - private static class RankedService implements Comparable { - private final Object service; - private final int rank; - - public RankedService(Object service, int rank) { - this.service = service; - this.rank = rank; - } - - public boolean isInstanceOf(Class iface) { - return iface.isAssignableFrom(service.getClass()); - } - - public int compareTo(RankedService that) { - return that.rank - rank; // highest number first - } - - } - } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 63c72bfef5..cec3eaab15 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -84,6 +84,8 @@ import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; +import java.util.function.Function; + /** * Parse xml config file. */ @@ -238,14 +240,14 @@ public TikaConfig() throws TikaException, IOException { LOG.debug("loading tika config from system property 'tika.config'"); } - if (config == null || config.trim().equals("")) { + if (config == null || config.trim().isEmpty()) { config = System.getenv("TIKA_CONFIG"); if (!StringUtils.isBlank(config)) { LOG.debug("loading tika config from environment variable 'TIKA_CONFIG'"); } } - if (config == null || config.trim().equals("")) { + if (config == null || config.trim().isEmpty()) { LOG.debug("loading tika config from defaults; no config file specified"); this.serviceLoader = new ServiceLoader(); this.mimeTypes = getDefaultMimeTypes(getContextClassLoader()); @@ -437,8 +439,7 @@ private static List getTopLevelElementChildren(Element element, String List elements = new ArrayList<>(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); - if (node instanceof Element) { - Element nodeE = (Element) node; + if (node instanceof Element nodeE) { if (childrenName.equals(nodeE.getTagName())) { elements.add(nodeE); } @@ -462,16 +463,15 @@ private static MimeTypes typesFromDomElement(Element element) } private static Set mediaTypesListFromDomElement(Element node, String tag) - throws TikaException, IOException { + throws TikaException { Set types = null; NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { Node cNode = children.item(i); - if (cNode instanceof Element) { - Element cElement = (Element) cNode; + if (cNode instanceof Element cElement) { if (tag.equals(cElement.getTagName())) { String mime = getText(cElement); - MediaType type = MediaType.parse(mime); + MediaType type = MediaType.parse(mime); if (type != null) { if (types == null) { types = new HashSet<>(); @@ -501,7 +501,6 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL if (LoadErrorHandler.WARN.toString().equalsIgnoreCase(loadErrorHandleConfig)) { loadErrorHandler = LoadErrorHandler.WARN; } else if (LoadErrorHandler.THROW.toString().equalsIgnoreCase(loadErrorHandleConfig)) { - loadErrorHandler = LoadErrorHandler.THROW; } else if (LoadErrorHandler.IGNORE.toString().equalsIgnoreCase(loadErrorHandleConfig)) { loadErrorHandler = LoadErrorHandler.IGNORE; } @@ -524,22 +523,23 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL private static InitializableProblemHandler getInitializableProblemHandler( String initializableProblemHandler) throws TikaConfigException { - if (initializableProblemHandler == null || initializableProblemHandler.length() == 0) { + if (initializableProblemHandler == null || initializableProblemHandler.isEmpty()) { return InitializableProblemHandler.DEFAULT; } - if (InitializableProblemHandler.IGNORE.toString() - .equalsIgnoreCase(initializableProblemHandler)) { - return InitializableProblemHandler.IGNORE; - } else if (InitializableProblemHandler.INFO.toString() - .equalsIgnoreCase(initializableProblemHandler)) { - return InitializableProblemHandler.INFO; - } else if (InitializableProblemHandler.WARN.toString() - .equalsIgnoreCase(initializableProblemHandler)) { - return InitializableProblemHandler.WARN; - } else if (InitializableProblemHandler.THROW.toString() - .equalsIgnoreCase(initializableProblemHandler)) { - return InitializableProblemHandler.THROW; + + Map> strategyMap = new HashMap<>(); + strategyMap.put("ignore", v -> InitializableProblemHandler.IGNORE); + strategyMap.put("info", v -> InitializableProblemHandler.INFO); + strategyMap.put("warn", v -> InitializableProblemHandler.WARN); + strategyMap.put("throw", v -> InitializableProblemHandler.THROW); + + Function strategy = + strategyMap.get(initializableProblemHandler.toLowerCase(Locale.US)); + + if (strategy != null) { + return strategy.apply(null); } + throw new TikaConfigException(String.format(Locale.US, "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'", initializableProblemHandler)); @@ -547,7 +547,7 @@ private static InitializableProblemHandler getInitializableProblemHandler( public static void mustNotBeEmpty(String paramName, String paramValue) throws TikaConfigException { - if (paramValue == null || paramValue.trim().equals("")) { + if (paramValue == null || paramValue.trim().isEmpty()) { throw new IllegalArgumentException( "parameter '" + paramName + "' must be set in the config file"); } @@ -701,15 +701,9 @@ CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) } } else if (!supportsComposite()) { // No composite support, just return the first one - if (loaded.size() == 1) { - return (CT) loaded.get(0); - } else if (loaded.size() > 1) { - throw new TikaConfigException( - "Composite not supported for " + getParentTagName() + - ". Must specify only one child!"); - } else { - //throw exception if empty? - } + throw new TikaConfigException( + "Composite not supported for " + getParentTagName() + + ". Must specify only one child!"); } // Wrap the defined parsers/detectors up in a Composite return createComposite(loaded, mimeTypes, loader); @@ -718,18 +712,15 @@ CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { String name = element.getAttribute("class"); - if (name == null) { - throw new TikaConfigException("class attribute must not be null: " + element); - } String initProbHandler = element.getAttribute("initializableProblemHandler"); InitializableProblemHandler initializableProblemHandler; - if (initProbHandler == null || initProbHandler.length() == 0) { + if (initProbHandler.isEmpty()) { initializableProblemHandler = loader.getInitializableProblemHandler(); } else { initializableProblemHandler = getInitializableProblemHandler(initProbHandler); } - T loaded = null; + T loaded; try { Class loadedClass = loader.getServiceClass(getLoaderClass(), name); @@ -938,18 +929,16 @@ Parser createComposite(Class parserClass, List childPa Map params, MimeTypes mimeTypes, ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { Parser parser = null; - Constructor c = null; + Constructor c; MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); // Try the possible default and composite parser constructors - if (parser == null) { - try { - c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class, EncodingDetector.class, Renderer.class); - parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer); - } catch (NoSuchMethodException me) { - //swallow - } + try { + c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, + Collection.class, EncodingDetector.class, Renderer.class); + parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer); + } catch (NoSuchMethodException me) { + //swallow } if (parser == null) { try { @@ -999,8 +988,8 @@ Parser createComposite(Class parserClass, List childPa // Create as a Parser Decorator if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) { try { - CompositeParser cp = null; - if (childParsers.size() == 1 && excludeParsers.size() == 0 && + CompositeParser cp; + if (childParsers.size() == 1 && excludeParsers.isEmpty() && childParsers.get(0) instanceof CompositeParser) { cp = (CompositeParser) childParsers.get(0); } else { @@ -1019,10 +1008,10 @@ Parser createComposite(Class parserClass, List childPa Parser newInstance(Class loadedClass) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { - Parser parser = null; + Parser parser; if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) { - Constructor ctor = loadedClass.getConstructor(EncodingDetector.class); - parser = (Parser) ctor.newInstance(encodingDetector); + Constructor ctor = loadedClass.getConstructor(EncodingDetector.class); + parser = ctor.newInstance(encodingDetector); } else { parser = loadedClass.getDeclaredConstructor().newInstance(); } @@ -1034,7 +1023,7 @@ Parser newInstance(Class loadedClass) } @Override - Parser decorate(Parser created, Element element) throws IOException, TikaException { + Parser decorate(Parser created, Element element) throws TikaException { Parser parser = created; // Is there an explicit list of mime types for this to handle? @@ -1074,7 +1063,7 @@ Class getLoaderClass() { @Override Detector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) { // If they asked for the mime types as a detector, give // them the one we've already created. TIKA-1708 if (MimeTypes.class.equals(loadedClass)) { @@ -1118,14 +1107,12 @@ Detector createComposite(Class detectorClass, MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); // Try the possible default and composite detector constructors - if (detector == null) { - try { - c = detectorClass - .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class); - detector = c.newInstance(mimeTypes, loader, excludeDetectors); - } catch (NoSuchMethodException me) { - //swallow - } + try { + c = detectorClass + .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class); + detector = c.newInstance(mimeTypes, loader, excludeDetectors); + } catch (NoSuchMethodException me) { + //swallow } if (detector == null) { try { @@ -1182,7 +1169,7 @@ Class getLoaderClass() { @Override Translator preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) { // Continue with normal loading return null; } @@ -1214,7 +1201,7 @@ Translator createComposite(Class compositeClass, Set> excludeChildren, Map params, MimeTypes mimeTypes, ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + throws InstantiationException { throw new InstantiationException("Only one translator supported"); } @@ -1232,7 +1219,7 @@ ConfigurableThreadPoolExecutor createComposite( List children, Set> excludeChildren, Map params, MimeTypes mimeTypes, ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + throws InstantiationException { throw new InstantiationException("Only one executor service supported"); } @@ -1249,7 +1236,7 @@ ConfigurableThreadPoolExecutor createDefault(MimeTypes mimeTypes, ServiceLoader @Override ConfigurableThreadPoolExecutor decorate(ConfigurableThreadPoolExecutor created, - Element element) throws IOException, TikaException { + Element element){ Element maxThreadElement = getChild(element, "max-threads"); if (maxThreadElement != null) { @@ -1303,7 +1290,7 @@ boolean isComposite(Class loadedClass) @Override ConfigurableThreadPoolExecutor preLoadOne( Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) { return null; } } @@ -1341,7 +1328,7 @@ boolean isComposite(Class loadedClass) { @Override EncodingDetector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1369,14 +1356,12 @@ EncodingDetector createComposite(Class encodingDetec Constructor c; // Try the possible default and composite detector constructors - if (encodingDetector == null) { - try { - c = encodingDetectorClass.getConstructor(ServiceLoader.class, Collection.class); - encodingDetector = c.newInstance(loader, excludeDetectors); - } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for service loader + collection for {}", - encodingDetectorClass); - } + try { + c = encodingDetectorClass.getConstructor(ServiceLoader.class, Collection.class); + encodingDetector = c.newInstance(loader, excludeDetectors); + } catch (NoSuchMethodException me) { + LOG.debug("couldn't find constructor for service loader + collection for {}", + encodingDetectorClass); } if (encodingDetector == null) { try { @@ -1430,7 +1415,7 @@ boolean isComposite(Class loadedClass) { @Override Renderer preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1457,24 +1442,12 @@ Renderer createComposite(Class rendererClass, Renderer renderer = null; Constructor c; - // Try the possible default and composite detector constructors - if (renderer == null) { - try { - c = rendererClass.getConstructor(ServiceLoader.class, Collection.class); - renderer = c.newInstance(loader, excludeRenderers); - } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for service loader + collection for {}", - renderer); - } - } - if (renderer == null) { - try { - c = rendererClass.getConstructor(List.class); - renderer = c.newInstance(childRenderers); - } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for Renderer(List) for {}", - rendererClass); - } + try { + c = rendererClass.getConstructor(List.class); + renderer = c.newInstance(childRenderers); + } catch (NoSuchMethodException me) { + LOG.debug("couldn't find constructor for Renderer(List) for {}", + rendererClass); } return renderer; } diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java index ed53918540..2765655bb3 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java @@ -3,8 +3,8 @@ * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.io.Serial; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -38,11 +39,11 @@ public class CompositeDetector implements Detector { /** * Serial version UID */ + @Serial private static final long serialVersionUID = 5980683158436430252L; - private final MediaTypeRegistry registry; - - private final List detectors; + protected final MediaTypeRegistry registry; + protected final List detectors; public CompositeDetector(MediaTypeRegistry registry, List detectors, Collection> excludeDetectors) { @@ -78,8 +79,6 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } MediaType type = MediaType.OCTET_STREAM; - //we have to iterate through all detectors because the override detector may - //be within a CompositeDetector for (Detector detector : getDetectors()) { MediaType detected = detector.detect(input, metadata); if (registry.isSpecializationOf(detected, type)) { @@ -90,11 +89,10 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * - * @param metadata + * @param metadata Metadata object * @return mediaType if a parseable mediatype was sent in via user or parser overrides */ - private static MediaType detectOverrides(Metadata metadata) { + protected static MediaType detectOverrides(Metadata metadata) { String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE); if (!StringUtils.isBlank(override)) { MediaType mt = MediaType.parse(override); @@ -104,13 +102,11 @@ private static MediaType detectOverrides(Metadata metadata) { } override = metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE); if (!StringUtils.isBlank(override)) { - MediaType mt = MediaType.parse(override); - if (mt != null) { - return mt; - } + return MediaType.parse(override); } return null; } + /** * Returns the component detectors. */ @@ -118,13 +114,13 @@ public List getDetectors() { return Collections.unmodifiableList(detectors); } - private boolean isExcluded(Collection> excludeDetectors, - Class d) { + protected boolean isExcluded(Collection> excludeDetectors, + Class d) { return excludeDetectors.contains(d) || assignableFrom(excludeDetectors, d); } - private boolean assignableFrom(Collection> excludeDetectors, - Class d) { + protected boolean assignableFrom(Collection> excludeDetectors, + Class d) { for (Class e : excludeDetectors) { if (e.isAssignableFrom(d)) { return true; diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java index 038d274e46..dae8ab1551 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java @@ -1,12 +1,12 @@ /* * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with + * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,6 +16,7 @@ */ package org.apache.tika.detect; +import java.io.Serial; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -28,21 +29,14 @@ /** * A composite detector based on all the {@link Detector} implementations * available through the {@link ServiceRegistry service provider mechanism}. - *

- * Detectors are loaded and returned in a specified order, of user supplied - * followed by non-MimeType Tika, followed by the Tika MimeType class. - * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list - * of Detectors in the required order. * * @since Apache Tika 0.9 */ public class DefaultDetector extends CompositeDetector { - /** - * Serial version UID - */ + @Serial private static final long serialVersionUID = -8170114575326908027L; + private transient final ServiceLoader loader; public DefaultDetector(MimeTypes types, ServiceLoader loader, @@ -52,7 +46,7 @@ public DefaultDetector(MimeTypes types, ServiceLoader loader, } public DefaultDetector(MimeTypes types, ServiceLoader loader) { - this(types, loader, Collections.EMPTY_SET); + this(types, loader, Collections.emptySet()); } public DefaultDetector(MimeTypes types, ClassLoader loader) { @@ -76,52 +70,40 @@ public DefaultDetector() { * rather than discovery order. Detectors are used in the given order, * so put the Tika parsers last so that non-Tika (user supplied) * parsers can take precedence. - *

- * If an {@link OverrideDetector} is loaded, it takes precedence over - * all other detectors. * * @param loader service loader * @return ordered list of statically loadable detectors */ private static List getDefaultDetectors(MimeTypes types, ServiceLoader loader, - Collection> - excludeDetectors) { - List detectors = - loader.loadStaticServiceProviders(Detector.class, excludeDetectors); - + Collection> excludeDetectors) { + List detectors = loader.loadStaticServiceProviders(Detector.class, excludeDetectors); ServiceLoaderUtils.sortLoadedClasses(detectors); - //look for the override index and put that first + int overrideIndex = -1; - int i = 0; - for (Detector detector : detectors) { - if (detector instanceof OverrideDetector) { + for (int i = 0; i < detectors.size(); i++) { + if (detectors.get(i) instanceof OverrideDetector) { overrideIndex = i; break; } - i++; } if (overrideIndex > -1) { - Detector detector = detectors.remove(overrideIndex); - detectors.add(0, detector); + Detector override = detectors.remove(overrideIndex); + detectors.add(0, override); } - // Finally the Tika MimeTypes as a fallback - detectors.add(types); + + detectors.add(types); // fallback return detectors; } @Override public List getDetectors() { if (loader != null && loader.isDynamic()) { - List detectors = loader.loadDynamicServiceProviders(Detector.class); - if (detectors.size() > 0) { - detectors.addAll(super.getDetectors()); - return detectors; - } else { - return super.getDetectors(); + List dynamicDetectors = loader.loadDynamicServiceProviders(Detector.class); + if (!dynamicDetectors.isEmpty()) { + dynamicDetectors.addAll(super.getDetectors()); + return dynamicDetectors; } - } else { - return super.getDetectors(); } + return super.getDetectors(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java index 50f8d790aa..bf4ef585af 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java +++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java @@ -23,14 +23,25 @@ */ public class TextStatistics { - private final int[] counts = new int[256]; - - private int total = 0; + private static final int ASCII_CONTROL_END = 0x20; + private static final int ASCII_PRINTABLE_START = 0x20; + private static final int ASCII_PRINTABLE_END = 0x80; + private static final int UTF8_CONTINUATION_START = 0x80; + private static final int UTF8_CONTINUATION_END = 0xc0; + private static final int UTF8_2BYTE_START = 0xc0; + private static final int UTF8_2BYTE_END = 0xe0; + private static final int UTF8_3BYTE_END = 0xf0; + private static final int UTF8_4BYTE_END = 0xf8; + private static final int INVALID_UTF8_START = 0xf8; + private static final int INVALID_UTF8_END = 0x100; + + private final int[] byteFrequencies = new int[256]; + private int totalBytes = 0; public void addData(byte[] buffer, int offset, int length) { for (int i = 0; i < length; i++) { - counts[buffer[offset + i] & 0xff]++; - total++; + byteFrequencies[buffer[offset + i] & 0xff]++; + totalBytes++; } } @@ -44,10 +55,12 @@ public void addData(byte[] buffer, int offset, int length) { * @see TIKA-688 */ public boolean isMostlyAscii() { - int control = count(0, 0x20); - int ascii = count(0x20, 128); - int safe = countSafeControl(); - return total > 0 && (control - safe) * 100 < total * 2 && (ascii + safe) * 100 > total * 90; + int controlCount = countRange(0, ASCII_CONTROL_END); + int asciiPrintableCount = countRange(ASCII_PRINTABLE_START, ASCII_PRINTABLE_END); + int safeControlCount = countSafeControl(); + return totalBytes > 0 && + (controlCount - safeControlCount) * 100 < totalBytes * 2 && + (asciiPrintableCount + safeControlCount) * 100 > totalBytes * 90; } /** @@ -58,21 +71,29 @@ public boolean isMostlyAscii() { * @since Apache Tika 1.3 */ public boolean looksLikeUTF8() { - int control = count(0, 0x20); - int utf8 = count(0x20, 0x80); - int safe = countSafeControl(); + int controlCount = countRange(0, ASCII_CONTROL_END); + int asciiUtf8Count = countRange(ASCII_PRINTABLE_START, ASCII_PRINTABLE_END); + int safeControlCount = countSafeControl(); int expectedContinuation = 0; - int[] leading = new int[]{count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)}; - for (int i = 0; i < leading.length; i++) { - utf8 += leading[i]; - expectedContinuation += (i + 1) * leading[i]; + int[] leadingBytes = new int[]{ + countRange(UTF8_2BYTE_START, UTF8_2BYTE_END), + countRange(UTF8_2BYTE_END, UTF8_3BYTE_END), + countRange(UTF8_3BYTE_END, UTF8_4BYTE_END) + }; + + for (int i = 0; i < leadingBytes.length; i++) { + asciiUtf8Count += leadingBytes[i]; + expectedContinuation += (i + 1) * leadingBytes[i]; } - int continuation = count(0x80, 0xc0); - return utf8 > 0 && continuation <= expectedContinuation && - continuation >= expectedContinuation - 3 && count(0xf8, 0x100) == 0 && - (control - safe) * 100 < utf8 * 2; + int continuationCount = countRange(UTF8_CONTINUATION_START, UTF8_CONTINUATION_END); + + return asciiUtf8Count > 0 && + continuationCount <= expectedContinuation && + continuationCount >= expectedContinuation - 3 && + countRange(INVALID_UTF8_START, INVALID_UTF8_END) == 0 && + (controlCount - safeControlCount) * 100 < asciiUtf8Count * 2; } /** @@ -81,7 +102,7 @@ public boolean looksLikeUTF8() { * @return count of all bytes */ public int count() { - return total; + return totalBytes; } /** @@ -91,7 +112,7 @@ public int count() { * @return count of the given byte */ public int count(int b) { - return counts[b & 0xff]; + return byteFrequencies[b & 0xff]; } /** @@ -117,7 +138,7 @@ public int count(int b) { * @see TIKA-154 */ public int countControl() { - return count(0, 0x20) - countSafeControl(); + return countRange(0, ASCII_CONTROL_END) - countSafeControl(); } /** @@ -127,7 +148,7 @@ public int countControl() { * @see #countControl() */ public int countSafeAscii() { - return count(0x20, 128) + countSafeControl(); + return countRange(ASCII_PRINTABLE_START, ASCII_PRINTABLE_END) + countSafeControl(); } /** @@ -136,21 +157,20 @@ public int countSafeAscii() { * @return count of eight bit characters */ public int countEightBit() { - return count(128, 256); + return countRange(128, 256); } - private int count(int from, int to) { - assert 0 <= from && to <= counts.length; - int count = 0; + private int countRange(int from, int to) { + assert 0 <= from && to <= byteFrequencies.length; + int sum = 0; for (int i = from; i < to; i++) { - count += counts[i]; + sum += byteFrequencies[i]; } - return count; + return sum; } private int countSafeControl() { return count('\t') + count('\n') + count('\r') // tab, LF, CR + count(0x0c) + count(0x1b); // new page, escape } - -} +} \ No newline at end of file diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java index ac5b3add87..9de1f3a648 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java @@ -183,25 +183,31 @@ public boolean isInstanceOf(String a, MediaType b) { * @since Apache Tika 0.8 */ public MediaType getSupertype(MediaType type) { - if (type == null) { - return null; - } else if (inheritance.containsKey(type)) { - return inheritance.get(type); - } else if (type.hasParameters()) { - return type.getBaseType(); - } else if (type.getSubtype().endsWith("+xml")) { - return MediaType.APPLICATION_XML; - } else if (type.getSubtype().endsWith("+zip")) { - return MediaType.APPLICATION_ZIP; - } else if ("text".equals(type.getType()) && !MediaType.TEXT_PLAIN.equals(type)) { - return MediaType.TEXT_PLAIN; - } else if (type.getType().contains("empty") && !MediaType.EMPTY.equals(type)) { - return MediaType.EMPTY; - } else if (!MediaType.OCTET_STREAM.equals(type)) { - return MediaType.OCTET_STREAM; - } else { - return null; - } + if (type == null) return null; + if (inheritance.containsKey(type)) return inheritance.get(type); + if (type.hasParameters()) return type.getBaseType(); + if (isXmlSubtype(type)) return MediaType.APPLICATION_XML; + if (isZipSubtype(type)) return MediaType.APPLICATION_ZIP; + if (isTextType(type)) return MediaType.TEXT_PLAIN; + if (isEmptyType(type)) return MediaType.EMPTY; + if (!MediaType.OCTET_STREAM.equals(type)) return MediaType.OCTET_STREAM; + return null; + } + + private boolean isXmlSubtype(MediaType type) { + return type.getSubtype().endsWith("+xml"); + } + + private boolean isZipSubtype(MediaType type) { + return type.getSubtype().endsWith("+zip"); + } + + private boolean isTextType(MediaType type) { + return "text".equals(type.getType()) && !MediaType.TEXT_PLAIN.equals(type); + } + + private boolean isEmptyType(MediaType type) { + return type.getType().contains("empty") && !MediaType.EMPTY.equals(type); } } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java index 76bc5c7525..c831d1ffa1 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java @@ -260,65 +260,76 @@ public InputSource resolveEntity(String publicId, String systemId) { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { - if (type == null) { - if (MIME_TYPE_TAG.equals(qName)) { - String name = attributes.getValue(MIME_TYPE_TYPE_ATTR); - String interpretedAttr = attributes.getValue(INTERPRETED_ATTR); - boolean interpreted = "true".equals(interpretedAttr); - try { - type = types.forName(name); - type.setInterpreted(interpreted); - } catch (MimeTypeException e) { - handleMimeError(name, e, qName, attributes); - } - } - } else if (ALIAS_TAG.equals(qName)) { - String alias = attributes.getValue(ALIAS_TYPE_ATTR); - types.addAlias(type, MediaType.parse(alias)); - } else if (SUB_CLASS_OF_TAG.equals(qName)) { - String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR); - types.setSuperType(type, MediaType.parse(parent)); - } else if (ACRONYM_TAG.equals(qName) || COMMENT_TAG.equals(qName) || - TIKA_LINK_TAG.equals(qName) || TIKA_UTI_TAG.equals(qName)) { - characters = new StringBuilder(); - } else if (GLOB_TAG.equals(qName)) { - String pattern = attributes.getValue(PATTERN_ATTR); - String isRegex = attributes.getValue(ISREGEX_ATTR); - if (pattern != null) { - try { - types.addPattern(type, pattern, Boolean.parseBoolean(isRegex)); - } catch (MimeTypeException e) { - handleGlobError(type, pattern, e, qName, attributes); - } - } - } else if (ROOT_XML_TAG.equals(qName)) { - String namespace = attributes.getValue(NS_URI_ATTR); - String name = attributes.getValue(LOCAL_NAME_ATTR); - type.addRootXML(namespace, name); - } else if (MATCH_TAG.equals(qName)) { - if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) { - current = new ClauseRecord(new MinShouldMatchVal( - Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); - } else { - String kind = attributes.getValue(MATCH_TYPE_ATTR); - String offset = attributes.getValue(MATCH_OFFSET_ATTR); - String value = attributes.getValue(MATCH_VALUE_ATTR); - String mask = attributes.getValue(MATCH_MASK_ATTR); - if (kind == null) { - kind = "string"; - } - current = - new ClauseRecord(new MagicMatch(type.getType(), kind, offset, value, mask)); - } - } else if (MAGIC_TAG.equals(qName)) { - String value = attributes.getValue(MAGIC_PRIORITY_ATTR); - if (value != null && value.length() > 0) { - priority = Integer.parseInt(value); - } else { - priority = 50; + try { + switch (qName) { + case MIME_TYPE_TAG -> handleMimeTypeTag(attributes); + case ALIAS_TAG -> handleAliasTag(attributes); + case SUB_CLASS_OF_TAG -> handleSubClassOfTag(attributes); + case ACRONYM_TAG, COMMENT_TAG, TIKA_LINK_TAG, TIKA_UTI_TAG -> characters = new StringBuilder(); + case GLOB_TAG -> handleGlobTag(attributes); + case ROOT_XML_TAG -> handleRootXMLTag(attributes); + case MATCH_TAG -> handleMatchTag(attributes); + case MAGIC_TAG -> handleMagicTag(attributes); } - current = new ClauseRecord(null); + } catch (MimeTypeException e) { + throw new SAXException("Error processing tag: " + qName, e); + } + } + + private void handleMimeTypeTag(Attributes attributes) throws MimeTypeException { + String name = attributes.getValue(MIME_TYPE_TYPE_ATTR); + String interpretedAttr = attributes.getValue(INTERPRETED_ATTR); + boolean interpreted = "true".equals(interpretedAttr); + type = types.forName(name); + type.setInterpreted(interpreted); + } + + private void handleAliasTag(Attributes attributes) { + String alias = attributes.getValue(ALIAS_TYPE_ATTR); + types.addAlias(type, MediaType.parse(alias)); + } + + private void handleSubClassOfTag(Attributes attributes) { + String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR); + types.setSuperType(type, MediaType.parse(parent)); + } + + private void handleGlobTag(Attributes attributes) throws MimeTypeException { + String pattern = attributes.getValue(PATTERN_ATTR); + String isRegex = attributes.getValue(ISREGEX_ATTR); + if (pattern != null) { + types.addPattern(type, pattern, Boolean.parseBoolean(isRegex)); + } + } + + private void handleRootXMLTag(Attributes attributes) { + String namespace = attributes.getValue(NS_URI_ATTR); + String name = attributes.getValue(LOCAL_NAME_ATTR); + type.addRootXML(namespace, name); + } + + private void handleMatchTag(Attributes attributes) { + if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) { + current = new ClauseRecord(new MinShouldMatchVal( + Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); + } else { + String kind = attributes.getValue(MATCH_TYPE_ATTR); + String offset = attributes.getValue(MATCH_OFFSET_ATTR); + String value = attributes.getValue(MATCH_VALUE_ATTR); + String mask = attributes.getValue(MATCH_MASK_ATTR); + if (kind == null) kind = "string"; + current = new ClauseRecord(new MagicMatch(type.getType(), kind, offset, value, mask)); + } + } + + private void handleMagicTag(Attributes attributes) { + String value = attributes.getValue(MAGIC_PRIORITY_ATTR); + if (value != null && value.length() > 0) { + priority = Integer.parseInt(value); + } else { + priority = 50; } + current = new ClauseRecord(null); } @Override