"""
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Copyright © 2019 Cloud Linux Software Inc.
This software is also available under ImunifyAV commercial license,
see
"""
import hashlib
import logging
import os
import re
from typing import Iterator, Optional
from .config import MAX_FILE_SIZE
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
def dir_traverse_generator(
    target_dir: str, max_size: int = MAX_FILE_SIZE
) -> Iterator[str]:
    """Recursively yield paths of files under *target_dir* of at most *max_size* bytes.

    Files whose size cannot be determined (broken symlinks, permission
    errors, files removed mid-walk) are skipped with a warning rather
    than aborting the traversal.

    :param target_dir: directory to walk recursively
    :param max_size: inclusive upper bound on file size, in bytes
    """
    for root, _dirs, filenames in os.walk(target_dir):
        for filename in filenames:
            path = os.path.join(root, filename)
            try:
                # getsize follows symlinks and raises OSError for
                # unreadable / vanished entries.
                if os.path.getsize(path) <= max_size:
                    yield path
            except OSError as e:
                logger.warning(
                    "Error occurred while processing file %s: %s", path, e
                )
def all_possible_relative_paths(abs_path: str, root_dir: str = "/") -> list:
    """Return every partial path of *abs_path* relative to *root_dir*.

    Accepts an absolute file path and produces the list of all suffix
    paths, longest first; e.g. for ('/a/b/c/d.txt', '/a/b/') the result
    is ['c/d.txt', 'd.txt'].
    """
    if abs_path:
        relative = os.path.relpath(abs_path, root_dir or "/")
    else:
        relative = ""
    parts = relative.strip(os.sep).split(os.sep)
    suffixes = []
    for start in range(len(parts)):
        suffixes.append(os.sep.join(parts[start:]))
    return suffixes
def get_base_dir(abs_path: str, rel_path: str) -> str:
    """Return the absolute base_dir such that os.path.join(base_dir, rel_path) == abs_path.

    E.g. get_base_dir('/a/b/c/d.txt', 'c/d.txt') returns '/a/b/'.

    :param abs_path: absolute file path
    :param rel_path: suffix of ``abs_path``
    :raises ValueError: if ``rel_path`` is not a suffix of ``abs_path``
    """
    if not abs_path.endswith(rel_path):
        raise ValueError(
            f"rel_path '{rel_path}' is not a suffix of abs_path '{abs_path}'"
        )
    # Guard the empty-suffix case: abs_path[:-len('')] would be
    # abs_path[:0] == '' instead of the full path.
    if not rel_path:
        return abs_path
    return abs_path[: -len(rel_path)]
class HashCalculator:
    """Computes md5/sha256 file hashes, optionally normalizing content first.

    For the md5 algorithm a normalizer is installed that strips control
    characters, collapses whitespace runs into a single space and
    lower-cases the data before hashing; for other algorithms
    normalization is the identity function.
    """

    HASHING_ALGORITHMS = {
        "md5": hashlib.md5,
        "sha256": hashlib.sha256,
    }
    # Chunk size (bytes) used when feeding data into the hash object.
    BUFFER_SIZE = 8192
    # Whitespace cluster: space plus TAB, LF, VT, FF, CR (0x20, 0x09-0x0D).
    _consolidate_whitespace = re.compile(b"[\x20\x09-\x0d]+")
    # Control characters to strip: ASCII controls minus whitespace, DEL and 0x80-0xFF.
    _remove_control_characters = re.compile(b"[\x00-\x08\x0e-\x1f\x7f-\xff]+")

    def __init__(self, algorithm: str):
        """
        :param algorithm: one of HASHING_ALGORITHMS keys ('md5', 'sha256')
        :raises KeyError: if the algorithm name is unsupported
        """
        self.algorithm = algorithm
        self._hasher_factory = self.HASHING_ALGORITHMS[algorithm]
        # Only md5 digests are computed over normalized data.
        self._normalizer = (
            self._normalize_data_for_md5 if algorithm == "md5" else lambda x: x
        )

    @classmethod
    def _normalize_data_for_md5(cls, data: bytes) -> bytes:
        """
        This method normalizes binary data by:
        - Removing the special control characters: 0x00-0x08, 0x0E-0x1F, 0x7F-0xFF;
          - 0x00-0x08, 0x0E-0x1F are ASCII control characters minus TAB, LF, VT, FF and CR;
          - 0x7F-0xFF are unicode control characters (DEL from C0 and C1 set).
        - Consolidating the clusters of whitespace characters (0x20, 0x09-0x0D) into single space character (0x20);
        - Turns all uppercase ASCII characters to lowercase.

        :param data: raw file content
        :raises TypeError: if *data* is not bytes
        """
        if not isinstance(data, bytes):
            raise TypeError("Normalization function expects bytes input")
        data = cls._remove_control_characters.sub(b"", data)
        data = cls._consolidate_whitespace.sub(b" ", data)
        return data.lower()

    def calc_hash(
        self, filepath: str, apply_normalization: bool = False
    ) -> Optional[str]:
        """Return the hex digest of *filepath*, or None if it is not a regular file.

        :param filepath: path of the file to hash
        :param apply_normalization: pass the content through the
            algorithm's normalizer before hashing
        """
        if not os.path.isfile(filepath):
            return None
        hasher = self._hasher_factory()
        with open(filepath, "rb") as file:
            if apply_normalization:
                # Normalization regexes may span chunk boundaries, so the
                # whole content must be read and normalized at once.
                data = self._normalizer(file.read())
                for i in range(0, len(data), self.BUFFER_SIZE):
                    hasher.update(data[i : i + self.BUFFER_SIZE])
            else:
                # No normalization needed: stream the file in chunks
                # instead of loading it entirely into memory.
                for chunk in iter(lambda: file.read(self.BUFFER_SIZE), b""):
                    hasher.update(chunk)
        return hasher.hexdigest()