Skip to content

Split text

SplitText

A utility class for splitting large text into smaller chunks using the RecursiveCharacterTextSplitter from the langchain_text_splitters library.

Attributes:

Name Type Description
chunk_size int

The maximum size of each text chunk, measured in tokens (the splitter is created with from_tiktoken_encoder). Defaults to 8000.

Source code in model/split_text.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class SplitText:
    """
    A utility class for splitting large text into smaller chunks.

    The work is delegated to RecursiveCharacterTextSplitter from the
    langchain_text_splitters library, created through its
    ``from_tiktoken_encoder`` constructor, so chunk length is measured with a
    tiktoken encoder (i.e. in tokens) rather than by the length of the string.

    Attributes:
        chunk_size (int): The maximum size of each text chunk, in tokens.
            Defaults to 8000.
    """

    def __init__(self, chunk_size: int = 8000) -> None:
        """
        Initializes the SplitText instance with a specified chunk size.

        Args:
            chunk_size (int): The maximum size of each text chunk, in tokens.
                Stored unchanged and handed to the splitter on every
                ``split_text`` call. Defaults to 8000.
        """

        self.chunk_size = chunk_size

    def split_text(self, content: str) -> List[str]:
        """
        Splits the given text into chunks of at most ``chunk_size`` tokens.

        Args:
            content (str): The text content to split into chunks.

        Returns:
            List[str]: The chunks in their original order. An empty list is
            returned for empty ``content``.

        Raises:
            ValueError: If ``content`` is not a string.
        """

        if not isinstance(content, str):
            raise ValueError("The 'content' parameter must be a string.")

        # Nothing to split: return right away instead of building the
        # tokenizer-backed splitter just to get an empty result.
        if not content:
            return []

        # chunk_overlap=0: consecutive chunks do not share any text.
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.chunk_size, chunk_overlap=0
        )
        return splitter.split_text(content)

__init__(chunk_size=8000)

Initializes the SplitText instance with a specified chunk size.

Parameters:

Name Type Description Default
chunk_size int

The maximum size of each text chunk, measured in tokens (the splitter is created with from_tiktoken_encoder). Defaults to 8000.

8000
Source code in model/split_text.py
13
14
15
16
17
18
19
20
21
def __init__(self, chunk_size: int = 8000) -> None:
    """
    Set up the instance with the chunk-size limit used when splitting.

    Args:
        chunk_size (int): Upper bound on the size of each chunk. The value
            is kept as given, without validation. Defaults to 8000.
    """

    # Remember the limit on the instance for later use.
    self.chunk_size = chunk_size

split_text(content)

Splits the given text into smaller chunks using RecursiveCharacterTextSplitter.

Parameters:

Name Type Description Default
content str

The text content to split into chunks.

required

Returns:

Type Description
List[str]

A list of text chunks, in the order they appear in the input.

Source code in model/split_text.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def split_text(self, content: str) -> List[str]:
    """
    Splits the given text into chunks of at most ``self.chunk_size`` tokens.

    The splitting is done by a RecursiveCharacterTextSplitter created with
    ``from_tiktoken_encoder``, so chunk length is measured with a tiktoken
    encoder rather than by the length of the string.

    Args:
        content (str): The text content to split into chunks.

    Returns:
        List[str]: The chunks in their original order. An empty list is
        returned for empty ``content``.

    Raises:
        ValueError: If ``content`` is not a string.
    """

    if not isinstance(content, str):
        raise ValueError("The 'content' parameter must be a string.")

    # Nothing to split: return right away instead of building the
    # tokenizer-backed splitter just to get an empty result.
    if not content:
        return []

    # chunk_overlap=0: consecutive chunks do not share any text.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=self.chunk_size, chunk_overlap=0
    )
    return splitter.split_text(content)