<?php
/**
* OpenAI Tokenizer
*/
namespace Classifai\Providers\OpenAI;
class Tokenizer {

	/**
	 * Maximum number of tokens our model supports
	 *
	 * @var int
	 */
	public $max_tokens;

	/**
	 * How many characters in one token (roughly)
	 *
	 * @var float
	 */
	public $characters_in_token = 3.5;

	/**
	 * How many tokens a word will take (roughly)
	 *
	 * @var float
	 */
	public $tokens_per_word = 1.5;

	/**
	 * OpenAI Tokenizer constructor.
	 *
	 * Stores the model's token limit and lets both estimation ratios be
	 * overridden through filters. A filtered value is only accepted when it
	 * is a finite, positive number; otherwise the built-in default is kept.
	 *
	 * @param int $max_tokens Maximum tokens the model supports.
	 */
	public function __construct( int $max_tokens ) {
		$this->max_tokens = $max_tokens;

		/**
		 * How many characters in one token (roughly)
		 *
		 * @since 2.4.0
		 * @hook classifai_openai_characters_in_token
		 *
		 * @param {float} $characters_in_token How many characters in one token (roughly)
		 * @param {int}   $max_tokens          Maximum tokens the model supports.
		 *
		 * @return {float}
		 */
		$characters_in_token = apply_filters( 'classifai_openai_characters_in_token', $this->characters_in_token, $max_tokens );

		// This value is used as a divisor, so zero or non-numeric input must be rejected.
		$this->characters_in_token = $this->sanitize_ratio( $characters_in_token, $this->characters_in_token );

		/**
		 * How many tokens a word will take (roughly)
		 *
		 * @since 2.4.0
		 * @hook classifai_openai_tokens_per_word
		 *
		 * @param {float} $tokens_per_word How many tokens a word will take (roughly)
		 * @param {int}   $max_tokens      Maximum tokens the model supports.
		 *
		 * @return {float}
		 */
		$tokens_per_word = apply_filters( 'classifai_openai_tokens_per_word', $this->tokens_per_word, $max_tokens );

		$this->tokens_per_word = $this->sanitize_ratio( $tokens_per_word, $this->tokens_per_word );
	}

	/**
	 * Validate a filtered ratio, falling back to a default when unusable.
	 *
	 * @param mixed $value    Value returned from a filter.
	 * @param float $fallback Value to use when $value is not a finite, positive number.
	 * @return float
	 */
	private function sanitize_ratio( $value, float $fallback ): float {
		if ( is_numeric( $value ) && is_finite( (float) $value ) && (float) $value > 0 ) {
			return (float) $value;
		}

		return $fallback;
	}

	/**
	 * Determine roughly how many tokens a string contains.
	 *
	 * The estimate is the character count (multibyte aware) divided by
	 * the characters-per-token ratio, rounded up.
	 *
	 * @param string $content Content to analyze.
	 * @return int
	 */
	public function tokens_in_content( string $content = '' ): int {
		$tokens = ceil( mb_strlen( $content ) / $this->characters_in_token );

		return (int) $tokens;
	}

	/**
	 * Determine how many tokens are in a certain number of words.
	 *
	 * The word count is made non-negative before being multiplied by the
	 * tokens-per-word ratio; the result is rounded up.
	 *
	 * @param int $words Number of words we want.
	 * @return int
	 */
	public function tokens_in_words( int $words = 1 ): int {
		$tokens = ceil( $this->tokens_per_word * absint( $words ) );

		return (int) $tokens;
	}

	/**
	 * Trim our content, if needed, to be under our max token number.
	 *
	 * Double line breaks are collapsed to a single space first. Content that
	 * already fits within the budget is returned as is (without trimming
	 * surrounding whitespace). Otherwise the content is cut to the estimated
	 * character limit, backed up to the previous word boundary if the cut
	 * landed mid-word, and stripped of surrounding whitespace.
	 *
	 * @param string $content    Content to trim.
	 * @param int    $max_tokens Maximum tokens our content can have. A value
	 *                           of zero or less yields an empty string for
	 *                           any non-empty content.
	 * @return string
	 */
	public function trim_content( string $content = '', int $max_tokens = 0 ): string {
		// Remove linebreaks that may have been added.
		$content = str_replace( "\n\n", ' ', $content );

		// Determine how many tokens the content has.
		$content_tokens = $this->tokens_in_content( $content );

		/*
		 * If we don't need to trim, return full content. Content that uses
		 * exactly the allowed number of tokens already fits, so this must be
		 * an inclusive comparison; otherwise the word-boundary logic below
		 * would needlessly drop the final word.
		 */
		if ( $content_tokens <= $max_tokens ) {
			return $content;
		}

		/**
		 * Next we determine how many tokens we need to trim by taking the
		 * number of tokens in the content and subtracting the max tokens
		 * we can have.
		 *
		 * Then we convert that token number to characters.
		 *
		 * Finally we determine what the max character length our content
		 * can be and trim it up.
		 */
		$tokens_to_trim     = $content_tokens - $max_tokens;
		$characters_to_trim = (int) ceil( $tokens_to_trim * $this->characters_in_token );
		$max_content_length = mb_strlen( $content ) - $characters_to_trim;

		/*
		 * Because token counts are rounded up, the number of characters to
		 * remove can exceed the content length. A negative length passed to
		 * mb_substr() means "stop that many characters from the end", which
		 * would return most of the content instead of none of it.
		 */
		if ( $max_content_length <= 0 ) {
			return '';
		}

		$trimmed_content = mb_substr( $content, 0, $max_content_length );

		/*
		 * Ensure our final string ends on a full word instead of truncating
		 * in the middle. We look at the last kept character and the first
		 * dropped one: if neither is a non-word character, the cut is inside
		 * a word, so back up to the last non-word character we kept.
		 */
		if ( ! preg_match( '/\\W/u', mb_substr( $content, $max_content_length - 1, 2 ) ) ) {
			if ( preg_match( '/.*\\W/u', $trimmed_content, $matches ) ) {
				$trimmed_content = $matches[0];
			}
		}

		return trim( $trimmed_content );
	}
}