001/*
002 * Licensed to DuraSpace under one or more contributor license agreements.
003 * See the NOTICE file distributed with this work for additional information
004 * regarding copyright ownership.
005 *
006 * DuraSpace licenses this file to you under the Apache License,
007 * Version 2.0 (the "License"); you may not use this file except in
008 * compliance with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.fcrepo.http.api;
019
import static org.apache.commons.lang3.StringUtils.isEmpty;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.fcrepo.kernel.api.exception.ExternalMessageBodyException;
import org.fcrepo.kernel.api.utils.AutoReloadingConfiguration;
import org.slf4j.Logger;
040
041/**
042 * Validates external content paths to ensure that they are within a configured allowed list of paths.
043 *
044 * @author bbpennel
045 */
046public class ExternalContentPathValidator extends AutoReloadingConfiguration {
047
048    private static final Logger LOGGER = getLogger(ExternalContentPathValidator.class);
049
050    private static final Set<String> ALLOWED_SCHEMES = new HashSet<>(Arrays.asList("file", "http", "https"));
051
052    private static final Pattern SCHEME_PATTERN = Pattern.compile("^(http|https|file):/.*");
053
054    // Pattern to check that an http uri contains a / after the domain if a domain is present
055    private static final Pattern HTTP_DOMAIN_PATTERN = Pattern.compile("^(http|https)://([^/]+/.*|$)");
056
057    private static final Pattern RELATIVE_MOD_PATTERN = Pattern.compile(".*(^|/)\\.\\.($|/).*");
058
059    private static final Pattern NORMALIZE_FILE_URI = Pattern.compile("^file:/{2,3}");
060
061    private List<String> allowedList;
062
063    /**
064     * Validates that an external path is valid. The path must be an HTTP or file URI within the allow list of paths,
065     * be absolute, and contain no relative modifier.
066     *
067     * @param extPath external binary path to validate
068     * @throws ExternalMessageBodyException thrown if the path is invalid.
069     */
070    public void validate(final String extPath) throws ExternalMessageBodyException {
071        if (allowedList == null || allowedList.size() == 0) {
072            throw new ExternalMessageBodyException("External content is disallowed by the server");
073        }
074
075        if (isEmpty(extPath)) {
076            throw new ExternalMessageBodyException("External content path was empty");
077        }
078
079        final String path = normalizePath(extPath.toLowerCase());
080
081        final URI uri;
082        try {
083            // Ensure that the path is a valid URL
084            uri = new URI(path);
085            uri.toURL();
086        } catch (final Exception e) {
087            throw new ExternalMessageBodyException("Path was not a valid URI: " + extPath);
088        }
089
090        // Decode the uri and ensure that it does not contain modifiers
091        final String decodedPath = uri.getPath();
092        if (RELATIVE_MOD_PATTERN.matcher(decodedPath).matches()) {
093            throw new ExternalMessageBodyException("Path was not absolute: " + extPath);
094        }
095
096        // Require that the path is absolute
097        if (!uri.isAbsolute()) {
098            throw new ExternalMessageBodyException("Path was not absolute: " + extPath);
099        }
100
101        // Ensure that an accept scheme was provided
102        final String scheme = uri.getScheme();
103        if (!ALLOWED_SCHEMES.contains(scheme)) {
104            throw new ExternalMessageBodyException("Path did not provide an allowed scheme: " + extPath);
105        }
106
107        // If a file, verify that it exists
108        if (scheme.equals("file") && !Paths.get(uri).toFile().exists()) {
109            throw new ExternalMessageBodyException("Path did not match any allowed external content paths: " +
110                    extPath);
111        }
112
113        // Check that the uri is within an allowed path
114        if (allowedList.stream().anyMatch(allowed -> path.startsWith(allowed))) {
115            return;
116        }
117        throw new ExternalMessageBodyException("Path did not match any allowed external content paths: " + extPath);
118    }
119
120    private String normalizePath(final String path) {
121        // file uris can have between 1 and 3 slashes depending on if the authority is present
122        if (path.startsWith("file://")) {
123            return NORMALIZE_FILE_URI.matcher(path).replaceFirst("file:/");
124        }
125        return path;
126    }
127
128    /**
129     * Loads the allowed list.
130     *
131     * @throws IOException thrown if the allowed list configuration file cannot be read.
132     */
133    @Override
134    protected synchronized void loadConfiguration() throws IOException {
135        LOGGER.info("Loading list of allowed external content locations from {}", configPath);
136        try (final Stream<String> stream = Files.lines(Paths.get(configPath))) {
137            allowedList = stream.map(line -> normalizePath(line.trim().toLowerCase()))
138                    .filter(line -> isAllowanceValid(line))
139                    .collect(Collectors.toList());
140        }
141    }
142
143    private boolean isAllowanceValid(final String allowance) {
144        final Matcher schemeMatcher = SCHEME_PATTERN.matcher(allowance);
145        final boolean schemeMatches = schemeMatcher.matches();
146        if (!schemeMatches || RELATIVE_MOD_PATTERN.matcher(allowance).matches()) {
147            LOGGER.error("Invalid path {} specified in external path configuration {}",
148                    allowance, configPath);
149            return false;
150        }
151
152        final String protocol = schemeMatcher.group(1);
153        if ("file".equals(protocol)) {
154            // If a file uri ends with / it must be a directory, otherwise it must be a file.
155            final File allowing = new File(URI.create(allowance).getPath());
156            if ((allowance.endsWith("/") && !allowing.isDirectory()) || (!allowance.endsWith("/") && !allowing
157                    .isFile())) {
158                LOGGER.error("Invalid path {} in configuration {}, directories must end with a '/'",
159                        allowance, configPath);
160                return false;
161            }
162        } else if ("http".equals(protocol) || "https".equals(protocol)) {
163            if (!HTTP_DOMAIN_PATTERN.matcher(allowance).matches()) {
164                LOGGER.error("Invalid path {} in configuration {}, domain must end with a '/'",
165                        allowance, configPath);
166                return false;
167            }
168        }
169        return true;
170    }
171}