"""
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Copyright © 2019 Cloud Linux Software Inc.
This software is also available under ImunifyAV commercial license,
see
"""
import hashlib
import logging
import os
import re
from typing import Iterator, Optional
from .config import MAX_FILE_SIZE
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
def dir_traverse_generator(
    target_dir: str, max_size: int = MAX_FILE_SIZE
) -> Iterator[str]:
    """Recursively yield paths of files under *target_dir* of at most *max_size* bytes.

    Files whose size cannot be determined (broken symlinks, permission
    errors, files removed mid-walk) are skipped with a warning rather
    than aborting the traversal.

    :param target_dir: directory to walk recursively
    :param max_size: inclusive upper bound on file size, in bytes
    """
    for root, _dirs, filenames in os.walk(target_dir):
        for filename in filenames:
            path = os.path.join(root, filename)
            try:
                # getsize follows symlinks and raises OSError for
                # unreadable / vanished entries.
                if os.path.getsize(path) <= max_size:
                    yield path
            except OSError as e:
                logger.warning(
                    "Error occurred while processing file %s: %s", path, e
                )
def all_possible_relative_paths(abs_path: str, root_dir: str = "/") -> list:
    """Return every partial path of *abs_path* relative to *root_dir*.

    Accepts an absolute file path and produces the list of all suffix
    paths, longest first; e.g. for ('/a/b/c/d.txt', '/a/b/') the result
    is ['c/d.txt', 'd.txt'].
    """
    if abs_path:
        relative = os.path.relpath(abs_path, root_dir or "/")
    else:
        relative = ""
    parts = relative.strip(os.sep).split(os.sep)
    suffixes = []
    for start in range(len(parts)):
        suffixes.append(os.sep.join(parts[start:]))
    return suffixes
def get_base_dir(abs_path: str, rel_path: str) -> str:
    """Return the absolute base_dir such that os.path.join(base_dir, rel_path) == abs_path.

    E.g. get_base_dir('/a/b/c/d.txt', 'c/d.txt') returns '/a/b/'.

    :param abs_path: absolute file path
    :param rel_path: suffix of ``abs_path``
    :raises ValueError: if ``rel_path`` is not a suffix of ``abs_path``
    """
    if not abs_path.endswith(rel_path):
        raise ValueError(
            f"rel_path '{rel_path}' is not a suffix of abs_path '{abs_path}'"
        )
    # Guard the empty-suffix case: abs_path[:-len('')] would be
    # abs_path[:0] == '' instead of the full path.
    if not rel_path:
        return abs_path
    return abs_path[: -len(rel_path)]
class HashCalculator:
    """Computes md5/sha256 file hashes, optionally normalizing content first.

    For the md5 algorithm a normalizer is installed that strips control
    characters, collapses whitespace runs into a single space and
    lower-cases the data before hashing; for other algorithms
    normalization is the identity function.
    """

    HASHING_ALGORITHMS = {
        "md5": hashlib.md5,
        "sha256": hashlib.sha256,
    }
    # Chunk size (bytes) used when feeding data into the hash object.
    BUFFER_SIZE = 8192
    # Whitespace cluster: space plus TAB, LF, VT, FF, CR (0x20, 0x09-0x0D).
    _consolidate_whitespace = re.compile(b"[\x20\x09-\x0d]+")
    # Control characters to strip: ASCII controls minus whitespace, DEL and 0x80-0xFF.
    _remove_control_characters = re.compile(b"[\x00-\x08\x0e-\x1f\x7f-\xff]+")

    def __init__(self, algorithm: str):
        """
        :param algorithm: one of HASHING_ALGORITHMS keys ('md5', 'sha256')
        :raises KeyError: if the algorithm name is unsupported
        """
        self.algorithm = algorithm
        self._hasher_factory = self.HASHING_ALGORITHMS[algorithm]
        # Only md5 digests are computed over normalized data.
        self._normalizer = (
            self._normalize_data_for_md5 if algorithm == "md5" else lambda x: x
        )

    @classmethod
    def _normalize_data_for_md5(cls, data: bytes) -> bytes:
        """
        This method normalizes binary data by:
        - Removing the special control characters: 0x00-0x08, 0x0E-0x1F, 0x7F-0xFF;
          - 0x00-0x08, 0x0E-0x1F are ASCII control characters minus TAB, LF, VT, FF and CR;
          - 0x7F-0xFF are unicode control characters (DEL from C0 and C1 set).
        - Consolidating the clusters of whitespace characters (0x20, 0x09-0x0D) into single space character (0x20);
        - Turns all uppercase ASCII characters to lowercase.

        :param data: raw file content
        :raises TypeError: if *data* is not bytes
        """
        if not isinstance(data, bytes):
            raise TypeError("Normalization function expects bytes input")
        data = cls._remove_control_characters.sub(b"", data)
        data = cls._consolidate_whitespace.sub(b" ", data)
        return data.lower()

    def calc_hash(
        self, filepath: str, apply_normalization: bool = False
    ) -> Optional[str]:
        """Return the hex digest of *filepath*, or None if it is not a regular file.

        :param filepath: path of the file to hash
        :param apply_normalization: pass the content through the
            algorithm's normalizer before hashing
        """
        if not os.path.isfile(filepath):
            return None
        hasher = self._hasher_factory()
        with open(filepath, "rb") as file:
            if apply_normalization:
                # Normalization regexes may span chunk boundaries, so the
                # whole content must be read and normalized at once.
                data = self._normalizer(file.read())
                for i in range(0, len(data), self.BUFFER_SIZE):
                    hasher.update(data[i : i + self.BUFFER_SIZE])
            else:
                # No normalization needed: stream the file in chunks
                # instead of loading it entirely into memory.
                for chunk in iter(lambda: file.read(self.BUFFER_SIZE), b""):
                    hasher.update(chunk)
        return hasher.hexdigest()