001/*
002 * Licensed to DuraSpace under one or more contributor license agreements.
003 * See the NOTICE file distributed with this work for additional information
004 * regarding copyright ownership.
005 *
006 * DuraSpace licenses this file to you under the Apache License,
007 * Version 2.0 (the "License"); you may not use this file except in
008 * compliance with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.fcrepo.http.api;
019
import static org.apache.commons.lang3.StringUtils.isEmpty;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.fcrepo.kernel.api.exception.ExternalMessageBodyException;
import org.fcrepo.kernel.api.utils.AutoReloadingConfiguration;
import org.slf4j.Logger;
040
041/**
042 * Validates external content paths to ensure that they are within a configured allowed list of paths.
043 *
044 * @author bbpennel
045 */
046public class ExternalContentPathValidator extends AutoReloadingConfiguration {
047
048    private static final Logger LOGGER = getLogger(ExternalContentPathValidator.class);
049
050    private static final Set<String> ALLOWED_SCHEMES = new HashSet<>(Arrays.asList("file", "http", "https"));
051
052    private static final Pattern SCHEME_PATTERN = Pattern.compile("^(http|https|file):/.*", Pattern.CASE_INSENSITIVE);
053
054    // Pattern to check that an http uri contains a / after the domain if a domain is present
055    private static final Pattern HTTP_DOMAIN_PATTERN = Pattern.compile("^(http|https)://([^/]+/.*|$)");
056
057    private static final Pattern RELATIVE_MOD_PATTERN = Pattern.compile(".*(^|/)\\.\\.($|/).*");
058
059    private static final Pattern NORMALIZE_FILE_URI = Pattern.compile("^file:/{2,3}");
060
061    private List<String> allowedList;
062
063    /**
064     * Validates that an external path is valid. The path must be an HTTP or file URI within the allow list of paths,
065     * be absolute, and contain no relative modifier.
066     *
067     * @param extPath external binary path to validate
068     * @throws ExternalMessageBodyException thrown if the path is invalid.
069     */
070    public void validate(final String extPath) throws ExternalMessageBodyException {
071        if (allowedList == null || allowedList.size() == 0) {
072            throw new ExternalMessageBodyException("External content is disallowed by the server");
073        }
074
075        if (isEmpty(extPath)) {
076            throw new ExternalMessageBodyException("External content path was empty");
077        }
078
079        final String path = normalizeUri(extPath);
080
081        final URI uri;
082        try {
083            // Ensure that the path is a valid URL
084            uri = new URI(path);
085            uri.toURL();
086        } catch (final Exception e) {
087            throw new ExternalMessageBodyException("Path was not a valid URI: " + extPath);
088        }
089
090        // Decode the uri and ensure that it does not contain modifiers
091        final String decodedPath = uri.getPath();
092        if (RELATIVE_MOD_PATTERN.matcher(decodedPath).matches()) {
093            throw new ExternalMessageBodyException("Path was not absolute: " + extPath);
094        }
095
096        // Require that the path is absolute
097        if (!uri.isAbsolute()) {
098            throw new ExternalMessageBodyException("Path was not absolute: " + extPath);
099        }
100
101        // Ensure that an accept scheme was provided
102        final String scheme = uri.getScheme();
103        if (!ALLOWED_SCHEMES.contains(scheme)) {
104            throw new ExternalMessageBodyException("Path did not provide an allowed scheme: " + extPath);
105        }
106
107        // If a file, verify that it exists
108        if (scheme.equals("file") && !Paths.get(uri).toFile().exists()) {
109            throw new ExternalMessageBodyException("Path did not match any allowed external content paths: " +
110                    extPath);
111        }
112
113        // Check that the uri is within an allowed path
114        if (allowedList.stream().anyMatch(allowed -> path.startsWith(allowed))) {
115            return;
116        }
117        throw new ExternalMessageBodyException("Path did not match any allowed external content paths: " + extPath);
118    }
119
120    private String normalizeUri(final String path) {
121        // lowercase the scheme since it is case insensitive
122        final String[] parts = path.split(":", 2);
123        final String normalized;
124        if (parts.length == 2) {
125            normalized = parts[0].toLowerCase() + ":" + parts[1];
126        } else {
127            return path;
128        }
129        // file uris can have between 1 and 3 slashes depending on if the authority is present
130        if (normalized.startsWith("file://")) {
131            return NORMALIZE_FILE_URI.matcher(normalized).replaceFirst("file:/");
132        }
133        return normalized;
134    }
135
136    /**
137     * Loads the allowed list.
138     *
139     * @throws IOException thrown if the allowed list configuration file cannot be read.
140     */
141    @Override
142    protected synchronized void loadConfiguration() throws IOException {
143        LOGGER.info("Loading list of allowed external content locations from {}", configPath);
144        try (final Stream<String> stream = Files.lines(Paths.get(configPath))) {
145            allowedList = stream.map(line -> normalizeUri(line.trim()))
146                    .filter(line -> isAllowanceValid(line))
147                    .collect(Collectors.toList());
148        }
149    }
150
151    private boolean isAllowanceValid(final String allowance) {
152        final Matcher schemeMatcher = SCHEME_PATTERN.matcher(allowance);
153        final boolean schemeMatches = schemeMatcher.matches();
154        if (!schemeMatches || RELATIVE_MOD_PATTERN.matcher(allowance).matches()) {
155            LOGGER.error("Invalid path {} specified in external path configuration {}",
156                    allowance, configPath);
157            return false;
158        }
159
160        final String protocol = schemeMatcher.group(1).toLowerCase();
161        if ("file".equals(protocol)) {
162            // If a file uri ends with / it must be a directory, otherwise it must be a file.
163            final File allowing = new File(URI.create(allowance).getPath());
164            if ((allowance.endsWith("/") && !allowing.isDirectory()) || (!allowance.endsWith("/") && !allowing
165                    .isFile())) {
166                LOGGER.error("Invalid path {} in configuration {}, directories must end with a '/'",
167                        allowance, configPath);
168                return false;
169            }
170        } else if ("http".equals(protocol) || "https".equals(protocol)) {
171            if (!HTTP_DOMAIN_PATTERN.matcher(allowance).matches()) {
172                LOGGER.error("Invalid path {} in configuration {}, domain must end with a '/'",
173                        allowance, configPath);
174                return false;
175            }
176        }
177        return true;
178    }
179}