­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ ­ """ This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program.  If not, see . Copyright © 2019 Cloud Linux Software Inc. This software is also available under ImunifyAV commercial license, see """ import hashlib import logging import os import re from typing import Iterator, Optional from .config import MAX_FILE_SIZE logger = logging.getLogger(__name__) def dir_traverse_generator( target_dir: str, max_size: int = MAX_FILE_SIZE ) -> Iterator[str]: for root, dirs, files in os.walk(target_dir): for file in files: path = os.path.join(root, file) try: if os.path.getsize(path) <= max_size: yield os.path.join(root, file) except Exception as e: logger.warning( "Error occurred while processing file %s: %s", path, e ) def all_possible_relative_paths(abs_path: str, root_dir: str = "/") -> list: # accepts absolute file_path # returns list of all possible partial paths relative to root_dir # e.g., for (/a/b/c/d.txt, '/a/b/') it should return ['c/d.txt', 'd.txt'] rel_path = os.path.relpath(abs_path, root_dir or "/") if abs_path else "" path_parts = rel_path.strip(os.sep).split(os.sep) return [os.sep.join(path_parts[i:]) for i in range(len(path_parts))] def get_base_dir(abs_path: str, rel_path: str) -> str: # returns absolute path of base_dir such that os.path.join(base_dir, rel_path) == abs_path # e.g. get_base_dir('/a/b/c/d.txt', 'c/d.txt') should return '/a/b/' if not abs_path.endswith(rel_path): raise ValueError( f"rel_path '{rel_path}' is not a suffix of abs_path '{abs_path}'" ) return abs_path[: -len(rel_path)] class HashCalculator: HASHING_ALGORITHMS = { "md5": hashlib.md5, "sha256": hashlib.sha256, } BUFFER_SIZE = 8192 _consolidate_whitespace = re.compile(b"[\x20\x09-\x0d]+") _remove_control_characters = re.compile(b"[\x00-\x08\x0e-\x1f\x7f-\xff]+") def __init__(self, algorithm: str): self.algorithm = algorithm self._hasher_factory = self.HASHING_ALGORITHMS[algorithm] self._normalizer = ( self._normalize_data_for_md5 if algorithm == "md5" else lambda x: x ) @classmethod def _normalize_data_for_md5(cls, data: bytes) -> bytes: """ This method normalizes binary data by: - Removing the special control characters: 0x00-0x08, 0x0E-0x1F, 0x7F-0xFF; - 0x00-0x08, 0x0E-0x1F are ASCII control characters minus TAB, LF, VT, FF and CR; - 0x7F-0xFF are unicode control characters (DEL from C0 and C1 set). - Consolidating the clusters of whitespace characters (0x20, 0x09-0x0D) into single space character (0x20); - Turns all uppercase ASCII characters to lowercase. """ if not isinstance(data, bytes): raise TypeError("Normalization function expects bytes input") data = cls._remove_control_characters.sub(b"", data) data = cls._consolidate_whitespace.sub(b" ", data) return data.lower() def calc_hash( self, filepath: str, apply_normalization: bool = False ) -> Optional[str]: if not os.path.isfile(filepath): return with open(filepath, "rb") as file: normalized_data = ( self._normalizer(file.read()) if apply_normalization else file.read() ) hasher = self._hasher_factory() for chunk in ( normalized_data[i : i + self.BUFFER_SIZE] for i in range(0, len(normalized_data), self.BUFFER_SIZE) ): hasher.update(chunk) return hasher.hexdigest()