```python
# Sample text to chunk
text = "This is the text I would like to chunk up. It is the example text for this exercise."

# Set the chunk size
chunk_size = 35

# Initialize a list to hold the chunks
chunks = []

# Iterate over the text to create chunks
for i in range(0, len(text), chunk_size):
    chunk = text[i:i + chunk_size]
    chunks.append(chunk)

# Display the chunks
print(chunks)
# Output: ['This is the text I would like to ch', 'unk up. It is the example text for ', 'this exercise.']
```
The same result can be achieved with LangChain's CharacterTextSplitter:
```python
from langchain.text_splitter import CharacterTextSplitter

# Initialize the text splitter with specified chunk size
text_splitter = CharacterTextSplitter(
    chunk_size=35, chunk_overlap=0, separator='', strip_whitespace=False
)

# Create documents using the text splitter
documents = text_splitter.create_documents([text])

# Display the created documents
for doc in documents:
    print(doc.page_content)
# Output:
# This is the text I would like to ch
# unk up. It is the example text for
# this exercise.
```
```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Sample text to chunk
text = """
The Olympic Games, originally held in ancient Greece, were revived in 1896
and have since become the world’s foremost sports competition, bringing
together athletes from around the globe.
"""

# Initialize the recursive character text splitter with specified chunk size
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=30,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Create documents using the text splitter
documents = text_splitter.create_documents([text])

# Display the created documents
for doc in documents:
    print(doc.page_content)
# Output (note how chunk_overlap=20 repeats words across chunk boundaries):
# The Olympic Games, originally
# held in ancient Greece, were
# revived in 1896 and have
# have since become the world’s
# world’s foremost sports
# competition, bringing together
# together athletes from around
# around the globe.
```
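Under the hood, the recursive splitter tries a list of separators in order, falling back to the next one whenever a piece is still too long. The call below is a minimal sketch that passes LangChain's default separator list explicitly, just to make that order visible; it behaves the same as the call above:

```python
# These separators match the library defaults: try paragraph breaks first,
# then line breaks, then spaces, and finally split between characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=30,
    chunk_overlap=20,
)
```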
```python
from langchain.text_splitter import PythonCodeTextSplitter

# Sample Python code
python_text = """
class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

p1 = Person("John", 36)

for i in range(10):
    print(i)
"""

# Initialize the Python code text splitter
python_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)

# Create documents using the text splitter
documents = python_splitter.create_documents([python_text])

# Display the created documents
for doc in documents:
    print(doc.page_content)
# Output:
# class Person:\n    def __init__(self, name, age):\n        self.name = name\n        self.age = age
# p1 = Person("John", 36)\n\nfor i in range(10):\n    print(i)
```
```python
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings
import numpy as np
import re

# Sample text
text = """
One of the most important things I didn't understand about the world when
I was a child is the degree to which the returns for performance are
superlinear. Teachers and coaches implicitly told us the returns were
linear. "You get out," I heard a thousand times, "what you put in." They
meant well, but this is rarely true. If your product is only half as good
as your competitor's, you don't get half as many customers. You get no
customers, and you go out of business. It's obviously true that the
returns for performance are superlinear in business. Some think this is a
flaw of capitalism, and that if we changed the rules it would stop being
true. But superlinear returns for performance are a feature of the world,
not an artifact of rules we've invented. We see the same pattern in fame,
power, military victories, knowledge, and even benefit to humanity. In all
of these, the rich get richer.
"""

# Split the text into sentences
sentences = re.split(r'(?<=[.?!])\s+', text)
sentences = [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

# Combine each sentence with its neighbors for added context
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        combined_sentence += sentences[i]['sentence']
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences

sentences = combine_sentences(sentences)

# Generate embeddings for the combined sentences
oai_embeds = OpenAIEmbeddings()
embeddings = oai_embeds.embed_documents(
    [x['combined_sentence'] for x in sentences]
)

# Attach each embedding to its sentence
for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = embeddings[i]

# Calculate cosine distances between consecutive sentences
def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity(
            [embedding_current], [embedding_next]
        )[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences

distances, sentences = calculate_cosine_distances(sentences)

# Determine breakpoints: split where the distance is in the top 5%
breakpoint_distance_threshold = np.percentile(distances, 95)
indices_above_thresh = [
    i for i, x in enumerate(distances) if x > breakpoint_distance_threshold
]

# Combine sentences into chunks at the breakpoints
chunks = []
start_index = 0
for index in indices_above_thresh:
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

if start_index < len(sentences):
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)

# Display the created chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk #{i+1}:\n{chunk}\n")
```
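As a quick sanity check, it can help to visualize where the breakpoints fall. Below is a minimal sketch (assuming matplotlib is installed; it is not required by the chunking logic itself) that plots the distances computed above against the 95th-percentile threshold:

```python
import matplotlib.pyplot as plt

# Plot the distance between each pair of consecutive sentences; peaks
# above the threshold line become chunk boundaries.
plt.plot(distances)
plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='--')
plt.xlabel('Sentence index')
plt.ylabel('Cosine distance to next sentence')
plt.title('Semantic chunking breakpoints')
plt.show()
```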
```python
# Step 3: LLM Decision Node
decision_node = LLMDecisionNode(
    input=splitter_node.output,
    prompt_template=(
        "Does the sentence '{next_sentence}' belong to the same chunk "
        "as '{current_chunk}'?"
    ),
    name="LLM Decision"
)
```
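Note that `LLMDecisionNode` above is illustrative pseudocode, not a class from a real library. A minimal runnable sketch of the same idea, using the OpenAI Python client directly and reusing the `sentences` list from the semantic chunking example (the model name and the YES/NO prompt wording are assumptions), might look like this:

```python
from openai import OpenAI

client = OpenAI()

def belongs_to_chunk(current_chunk: str, next_sentence: str) -> bool:
    """Ask the LLM whether next_sentence continues the topic of current_chunk."""
    prompt = (
        f"Does the sentence '{next_sentence}' belong to the same chunk as "
        f"'{current_chunk}'? Answer YES or NO."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; any chat model works
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content.strip().upper().startswith("YES")

# Greedily grow chunks sentence by sentence based on the LLM's decision
chunks, current = [], sentences[0]['sentence']
for s in sentences[1:]:
    if belongs_to_chunk(current, s['sentence']):
        current += ' ' + s['sentence']
    else:
        chunks.append(current)
        current = s['sentence']
chunks.append(current)
```

This trades one LLM call per sentence for chunk boundaries that follow topical shifts rather than character counts, so it is considerably slower and more expensive than the splitters shown earlier.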