Skip to content

Tika

TikaParser

A class for parsing content from files and file bytes using Apache Tika.

Attributes:

Name Type Description
tika_server str

The endpoint for the Tika server. Defaults to 'http://localhost:8002/'.

Source code in model/tika.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class TikaParser:
    """
    Extract text from files or raw bytes through an Apache Tika server.

    Attributes:
        tika_server (str): The endpoint for the Tika server. Defaults to 'http://localhost:8002/'.
        content (str): Text produced by the most recent parse call; an empty
            string until a parse has returned text.
    """

    def __init__(self, tika_server: str = 'http://localhost:8002/') -> None:
        """
        Store the Tika server endpoint and call ``tika.initVM()``.

        Args:
            tika_server (str): The endpoint for the Tika server. Defaults to 'http://localhost:8002/'.
        """
        self.tika_server = tika_server
        # Declare the attribute up front so it can be read before any parse call.
        self.content: str = ""
        # NOTE(review): in recent tika-python releases initVM() appears to be a
        # backward-compatibility no-op -- confirm against the installed version.
        tika.initVM()

    def tika_parser_from_bytes(self, file_binary: bytes) -> str:
        """
        Parse in-memory file content using the Tika server.

        The result is also stored on ``self.content``.

        Args:
            file_binary (bytes): The binary content of the file to parse.

        Returns:
            str: The parsed content of the file, or an empty string when the
            server response carries no content.
        """
        parsed = parser.from_buffer(string=file_binary, serverEndpoint=self.tika_server)
        # A missing or None "content" entry is normalised to "" so the
        # declared ``str`` return type always holds.
        self.content = parsed.get("content") or ""
        return self.content

    def tika_parser_from_file_path(self, file_path: str) -> str:
        """
        Parse the file at ``file_path`` using the Tika server.

        The result is also stored on ``self.content``.

        Args:
            file_path (str): The path to the file to parse.

        Returns:
            str: The parsed content of the file, or an empty string when the
            server response carries no content.
        """
        parsed = parser.from_file(filename=file_path, serverEndpoint=self.tika_server)
        # A missing or None "content" entry is normalised to "" so the
        # declared ``str`` return type always holds.
        self.content = parsed.get("content") or ""
        return self.content

    def hash_file_bytes(self, file_bytes: bytes) -> str:
        """
        Compute the SHA-256 digest of the given bytes.

        Args:
            file_bytes (bytes): The binary content of the file to hash.

        Returns:
            str: The SHA-256 hash of the content as a hexadecimal string.
        """
        return hashlib.sha256(file_bytes).hexdigest()

__init__(tika_server='http://localhost:8002/')

Initializes the TikaParser instance with a Tika server endpoint and starts the Tika Java Virtual Machine.

Parameters:

Name Type Description Default
tika_server str

The endpoint for the Tika server. Defaults to 'http://localhost:8002/'.

'http://localhost:8002/'
Source code in model/tika.py
13
14
15
16
17
18
19
20
21
def __init__(self, tika_server: str = 'http://localhost:8002/') -> None:
    """
    Store the Tika server endpoint and call ``tika.initVM()``.

    Args:
        tika_server (str): The endpoint for the Tika server. Defaults to 'http://localhost:8002/'.
    """
    self.tika_server = tika_server
    # Declare the attribute up front so it can be read before any parse call.
    self.content: str = ""
    # NOTE(review): in recent tika-python releases initVM() appears to be a
    # backward-compatibility no-op -- confirm against the installed version.
    tika.initVM()

hash_file_bytes(file_bytes)

Generates a SHA-256 hash for the given file bytes.

Parameters:

Name Type Description Default
file_bytes bytes

The binary content of the file to hash.

required

Returns:

Name Type Description
str str

The SHA-256 hash of the file content.

Source code in model/tika.py
51
52
53
54
55
56
57
58
59
60
61
62
63
def hash_file_bytes(self, file_bytes: bytes) -> str:
    """
    Compute the SHA-256 digest of the given bytes.

    Args:
        file_bytes (bytes): The binary content of the file to hash.

    Returns:
        str: The SHA-256 hash of the content as a hexadecimal string.
    """
    # Passing the data to the constructor is equivalent to a single update().
    return hashlib.sha256(file_bytes).hexdigest()

tika_parser_from_bytes(file_binary)

Parses the content of a file provided as bytes using the Tika server.

Parameters:

Name Type Description Default
file_binary bytes

The binary content of the file to parse.

required

Returns:

Name Type Description
str str

The parsed content of the file.

Source code in model/tika.py
23
24
25
26
27
28
29
30
31
32
33
34
35
def tika_parser_from_bytes(self, file_binary: bytes) -> str:
    """
    Parse in-memory file content using the Tika server.

    The result is also stored on ``self.content``.

    Args:
        file_binary (bytes): The binary content of the file to parse.

    Returns:
        str: The parsed content of the file, or an empty string when the
        server response carries no content.
    """
    parsed = parser.from_buffer(string=file_binary, serverEndpoint=self.tika_server)
    # A missing or None "content" entry is normalised to "" so the
    # declared ``str`` return type always holds.
    self.content = parsed.get("content") or ""
    return self.content

tika_parser_from_file_path(file_path)

Parses the content of a file provided via file path using the Tika server.

Parameters:

Name Type Description Default
file_path str

The path to the file to parse.

required

Returns:

Name Type Description
str str

The parsed content of the file.

Source code in model/tika.py
37
38
39
40
41
42
43
44
45
46
47
48
49
def tika_parser_from_file_path(self, file_path: str) -> str:
    """
    Parse the file at ``file_path`` using the Tika server.

    The result is also stored on ``self.content``.

    Args:
        file_path (str): The path to the file to parse.

    Returns:
        str: The parsed content of the file, or an empty string when the
        server response carries no content.
    """
    parsed = parser.from_file(filename=file_path, serverEndpoint=self.tika_server)
    # A missing or None "content" entry is normalised to "" so the
    # declared ``str`` return type always holds.
    self.content = parsed.get("content") or ""
    return self.content