# 1. Set up your Cohere client, translation prompt and maximum words per chunk
import cohere

# NOTE(review): placeholder credential — replace with a real API key (or read it
# from an environment variable) before running.
co = cohere.ClientV2("<YOUR API KEY>")
model = "command-a-translate-08-2025"

target_language = "Spanish"
# The {target_language} placeholder is filled via str.format() in the translation loop below.
prompt_template = "Translate everything that follows into {target_language}:\n\n"
max_words = 15  # Set your desired maximum number of words per chunk

# 2. Your source text
text = (
    "Enterprises rely on translation for some of their most sensitive and business-critical documents and cannot risk data leakage, compliance violations, or misunderstandings. Mistranslated documents can reduce trust and have strategic implications."
)
# 3. Define the chunk_split function (from earlier in your notebook)
def chunk_split(text, max_words, threshold=0.8):
    """Split *text* into chunks of at most *max_words* words.

    Within each candidate chunk, prefer to break at a natural separator
    (sentence end, closing parenthesis, or word gap) that falls in the last
    (1 - threshold) fraction of the chunk, so chunks end on clean boundaries.

    Args:
        text: The source text to split (whitespace-delimited words).
        max_words: Maximum number of words per chunk.
        threshold: Fraction (0-1) of the chunk's character length before
            which a separator is considered too early to break at.

    Returns:
        list[str]: The chunks, in order. Returns [] for empty input.
    """
    words = text.split()  # Tokenize on any whitespace
    chunks = []
    start = 0  # Index into `words` of the first word of the current chunk

    while start < len(words):
        end = min(start + max_words, len(words))
        chunk_text = " ".join(words[start:end])

        # Final chunk: take whatever remains as-is.
        # (The original also tested `len(chunk_words) < max_words * threshold`,
        # but that can only be true when end == len(words), so it was redundant.)
        if end == len(words):
            chunks.append(chunk_text)
            break

        # Try to find a natural breaking point late in the chunk.
        # "\n" is kept for compatibility but can never match: chunk_text is
        # rebuilt with " ".join and therefore contains no newlines.
        split_point = None
        for separator in ("\n", ".", ")", " "):
            idx = chunk_text.rfind(separator)
            if idx == -1 or idx < len(chunk_text) * threshold:
                continue
            # BUGFIX: only break on a word-aligned separator. A "." or ")"
            # inside a word (e.g. "3.2", "e.g.") used to split mid-word and
            # silently drop the word's tail from the output.
            if (separator.isspace()
                    or idx + 1 == len(chunk_text)
                    or chunk_text[idx + 1] == " "):
                split_point = idx + 1  # Position just after the separator
                break

        if split_point:
            # Keep the text up to the separator; advance by the whole words consumed.
            head = chunk_text[:split_point].strip()
            chunks.append(head)
            start += len(head.split())
        else:
            # No good split point: emit the full-size chunk.
            chunks.append(chunk_text)
            start = end

    return chunks
| 55 | |
# 4. Split the text into chunks using chunk_split
chunks = chunk_split(text, max_words=max_words)

# 5. Translate each chunk and collect results
# The instruction prefix is the same for every chunk, so build it once.
instruction = prompt_template.format(target_language=target_language)
translated_chunks = []
for chunk in chunks:
    response = co.chat(
        model=model,
        messages=[{"role": "user", "content": instruction + chunk}],
    )
    translated_chunks.append(response.message.content[0].text)

# 6. Merge the translated chunks back together
translated_text = " ".join(translated_chunks)

# 7. Output the final translation
print(translated_text)