Source: Providers/OpenAI/Embeddings.php

  1. <?php
  2. /**
  3. * OpenAI Embeddings integration
  4. */
  5. namespace Classifai\Providers\OpenAI;
  6. use Classifai\Admin\Notifications;
  7. use Classifai\Providers\Provider;
  8. use Classifai\Providers\OpenAI\APIRequest;
  9. use Classifai\Providers\OpenAI\EmbeddingCalculations;
  10. use Classifai\Normalizer;
  11. use Classifai\Features\Classification;
  12. use Classifai\Features\Feature;
  13. use Classifai\EmbeddingsScheduler;
  14. use WP_Error;
  15. use function Classifai\should_use_legacy_settings_panel;
  16. class Embeddings extends Provider {
  17. use \Classifai\Providers\OpenAI\OpenAI;
  18. const ID = 'openai_embeddings';
  19. /**
  20. * OpenAI Embeddings URL.
  21. *
  22. * @var string
  23. */
  24. protected $api_url = 'https://api.openai.com/v1/embeddings';
  25. /**
  26. * OpenAI Embeddings model.
  27. *
  28. * @var string
  29. */
  30. protected $model = 'text-embedding-3-small';
  31. /**
  32. * Maximum number of tokens our model supports.
  33. *
  34. * @var int
  35. */
  36. protected $max_tokens = 8191;
  37. /**
  38. * Number of dimensions for the embeddings.
  39. *
  40. * @var int
  41. */
  42. protected $dimensions = 512;
  43. /**
  44. * Maximum number of terms we process.
  45. *
  46. * @var int
  47. */
  48. protected $max_terms = 5000;
  49. /**
  50. * NLU features that are supported by this provider.
  51. *
  52. * @var array
  53. */
  54. public $nlu_features = [];
  55. /**
  56. * Scheduler instance.
  57. *
  58. * @var EmbeddingsScheduler|null
  59. */
  60. private static $scheduler_instance = null;
  /**
   * Set up the provider for a feature.
   *
   * When the feature exposes `get_supported_taxonomies()`, one entry is
   * added to `$nlu_features` for every taxonomy it returns for the post
   * types saved in the feature's option (falling back to `post`).
   *
   * @param \Classifai\Features\Feature $feature_instance The feature instance.
   */
  public function __construct( $feature_instance = null ) {
      $this->feature_instance = $feature_instance;

      $supports_taxonomies = $this->feature_instance && method_exists( $this->feature_instance, 'get_supported_taxonomies' );

      // Nothing more to describe when there is no feature or it has no taxonomies.
      if ( ! $supports_taxonomies ) {
          return;
      }

      $saved_settings = get_option( $this->feature_instance->get_option_name(), [] );
      $post_types     = [ 'post' => 1 ];

      if ( isset( $saved_settings['post_types'] ) ) {
          $post_types = $saved_settings['post_types'];
      }

      $taxonomies = $this->feature_instance->get_supported_taxonomies( $post_types );

      foreach ( $taxonomies as $slug => $name ) {
          $this->nlu_features[ $slug ] = [
              'feature'           => $name,
              'threshold'         => __( 'Threshold (%)', 'classifai' ),
              'threshold_default' => 75,
              'taxonomy'          => __( 'Taxonomy', 'classifai' ),
              'taxonomy_default'  => $slug,
          ];
      }
  }
  /**
   * Retrieve the embeddings endpoint after it has been run through its filter.
   *
   * @return string
   */
  public function get_api_url(): string {
      $default_url = $this->api_url;

      /**
       * Filter the API URL.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_api_url
       *
       * @param {string} $url The default API URL.
       *
       * @return {string} The API URL.
       */
      $filtered_url = apply_filters( 'classifai_openai_embeddings_api_url', $default_url );

      return $filtered_url;
  }
  /**
   * Retrieve the embeddings model name after it has been run through its filter.
   *
   * @return string
   */
  public function get_model(): string {
      $default_model = $this->model;

      /**
       * Filter the model name.
       *
       * Lets a site swap in another model, for example
       * text-embedding-3-large.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_model
       *
       * @param {string} $model The default model to use.
       *
       * @return {string} The model to use.
       */
      $filtered_model = apply_filters( 'classifai_openai_embeddings_model', $default_model );

      return $filtered_model;
  }
  /**
   * Retrieve the embedding length (number of dimensions) after filtering.
   *
   * @return int
   */
  public function get_dimensions(): int {
      $default_dimensions = $this->dimensions;

      /**
       * Filter the dimensions we want for each embedding.
       *
       * Lets a site make each embedding longer or shorter.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_dimensions
       *
       * @param {int} $dimensions The default dimensions.
       *
       * @return {int} The dimensions.
       */
      $filtered_dimensions = apply_filters( 'classifai_openai_embeddings_dimensions', $default_dimensions );

      return $filtered_dimensions;
  }
  /**
   * Retrieve the token limit after it has been run through its filter.
   *
   * @return int
   */
  public function get_max_tokens(): int {
      $default_limit = $this->max_tokens;

      /**
       * Filter the max number of tokens.
       *
       * Lets a site match the limit of a different model, or
       * be stricter about how many tokens may be used.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_max_tokens
       *
       * @param {int} $max_tokens The default maximum tokens.
       *
       * @return {int} The maximum tokens.
       */
      $filtered_limit = apply_filters( 'classifai_openai_embeddings_max_tokens', $default_limit );

      return $filtered_limit;
  }
  /**
   * Retrieve the cap on how many terms are considered, after filtering.
   *
   * @return int
   */
  public function get_max_terms(): int {
      $default_cap = $this->max_terms;

      /**
       * Filter the max number of terms.
       *
       * The default is 5000. Lower it to help with performance, or
       * raise it so more terms are taken into account.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_max_terms
       *
       * @param {int} $terms The default maximum terms.
       *
       * @return {int} The maximum terms.
       */
      $filtered_cap = apply_filters( 'classifai_openai_embeddings_max_terms', $default_cap );

      return $filtered_cap;
  }
  /**
   * Register the API key settings field for this provider.
   *
   * Fires `classifai_{ID}_render_provider_fields` afterwards so additional
   * fields can be attached.
   */
  public function render_provider_fields() {
      $feature  = $this->feature_instance;
      $settings = $feature->get_settings( static::ID );

      $field_id    = static::ID . '_api_key';
      $field_title = esc_html__( 'API Key', 'classifai' );
      $page        = $feature->get_option_name();
      $section     = $feature->get_option_name() . '_section';

      // The sign-up hint is only shown while the provider is not configured yet.
      $description = '';

      if ( ! $feature->is_configured_with_provider( static::ID ) ) {
          $allowed_html = [
              'a' => [
                  'href'  => [],
                  'title' => [],
              ],
          ];

          /* translators: %1$s is replaced with the OpenAI sign up URL */
          $hint = __( 'Don\'t have an OpenAI account yet? <a title="Sign up for an OpenAI account" href="%1$s">Sign up for one</a> in order to get your API key.', 'classifai' );

          $description = sprintf(
              wp_kses( $hint, $allowed_html ),
              esc_url( 'https://platform.openai.com/signup' )
          );
      }

      $field_args = [
          'option_index'  => static::ID,
          'label_for'     => 'api_key',
          'input_type'    => 'password',
          'default_value' => $settings['api_key'],
          'class'         => 'classifai-provider-field hidden provider-scope-' . static::ID, // Important to add this.
          'description'   => $description,
      ];

      add_settings_field(
          $field_id,
          $field_title,
          [ $feature, 'render_input' ],
          $page,
          $section,
          $field_args
      );

      do_action( 'classifai_' . static::ID . '_render_provider_fields', $this );
  }
  /**
   * Default settings for this provider: an empty API key, not authenticated.
   *
   * @return array
   */
  public function get_default_provider_settings(): array {
      return [
          'api_key'       => '',
          'authenticated' => false,
      ];
  }
  /**
   * Hook the provider into WordPress.
   *
   * The default-settings filter, the scheduler and the scheduled-job action
   * are always registered. The term, AJAX and admin-post hooks are only
   * added when the Classification feature is enabled and uses this provider.
   *
   * This only fires if can_register returns true.
   */
  public function register() {
      add_filter( 'classifai_feature_classification_get_default_settings', [ $this, 'modify_default_feature_settings' ], 10, 2 );

      $classification = new Classification();

      self::$scheduler_instance = new EmbeddingsScheduler(
          'classifai_schedule_generate_embedding_job',
          __( 'OpenAI Embeddings', 'classifai' )
      );
      self::$scheduler_instance->init();

      add_action( 'classifai_schedule_generate_embedding_job', [ $this, 'generate_embedding_job' ], 10, 4 );

      $is_active_provider = $classification->is_feature_enabled()
          && $classification->get_feature_provider_instance()::ID === static::ID;

      if ( ! $is_active_provider ) {
          return;
      }

      // Keep term embeddings current whenever a term is created or edited.
      foreach ( [ 'created_term', 'edited_terms' ] as $term_hook ) {
          add_action( $term_hook, [ $this, 'generate_embeddings_for_term' ] ); /** @phpstan-ignore return.void (function is used in multiple contexts and needs to return data if called directly) */
      }

      add_action( 'wp_ajax_get_post_classifier_embeddings_preview_data', array( $this, 'get_post_classifier_embeddings_preview_data' ) );
      add_action( 'admin_post_classifai_regen_embeddings', [ $this, 'classifai_regen_embeddings' ] );
  }
  /**
   * Modify the default settings for the classification feature.
   *
   * Adds, for every supported taxonomy, an enabled flag (on for `category`
   * only), a threshold of 75 and the taxonomy slug. Settings are returned
   * untouched when the feature is configured with a different provider.
   *
   * @param array   $settings         Current settings.
   * @param Feature $feature_instance The feature instance.
   * @return array
   */
  public function modify_default_feature_settings( array $settings, $feature_instance ): array {
      $hook     = 'classifai_feature_classification_get_default_settings';
      $callback = [ $this, 'modify_default_feature_settings' ];

      // Unhook while reading the provider setting. NOTE(review): presumably
      // get_settings() resolves defaults through this same filter, which would
      // re-enter this callback — confirm against Feature::get_settings().
      remove_filter( $hook, $callback, 10 );
      $provider = $feature_instance->get_settings( 'provider' );

      // Always restore the hook. Previously the early return below left it
      // removed for the rest of the request, so later lookups (for example
      // after the provider is switched during a save) lost these defaults.
      add_filter( $hook, $callback, 10, 2 );

      if ( $provider !== static::ID ) {
          return $settings;
      }

      $defaults = [];

      foreach ( array_keys( $feature_instance->get_supported_taxonomies() ) as $tax ) {
          $defaults[ $tax ]                = 'category' === $tax;
          $defaults[ $tax . '_threshold' ] = 75;
          $defaults[ $tax . '_taxonomy' ]  = $tax;
      }

      return array_merge( $settings, $defaults );
  }
  /**
   * Sanitize the provider settings that are about to be saved.
   *
   * Copies the sanitized API key and authentication flag into the new
   * settings. When the provider is authenticated and the feature status
   * is on, embedding generation is triggered for each enabled taxonomy.
   *
   * @param array $new_settings Array of settings about to be saved.
   * @return array The sanitized settings to be saved.
   */
  public function sanitize_settings( array $new_settings ): array {
      $provider_id      = static::ID;
      $current_settings = $this->feature_instance->get_settings();
      $sanitized_key    = $this->sanitize_api_key_settings( $new_settings, $current_settings );

      $new_settings[ $provider_id ]['api_key']       = $sanitized_key[ $provider_id ]['api_key'];
      $new_settings[ $provider_id ]['authenticated'] = $sanitized_key[ $provider_id ]['authenticated'];

      $is_authenticated = (bool) $new_settings[ $provider_id ]['authenticated'];
      $is_feature_on    = isset( $new_settings['status'] ) && 1 === (int) $new_settings['status'];

      if ( ! $is_authenticated || ! $is_feature_on ) {
          return $new_settings;
      }

      // Kick off embedding generation for every taxonomy that is switched on.
      foreach ( array_keys( $this->nlu_features ) as $taxonomy ) {
          if ( ! isset( $new_settings[ $taxonomy ] ) ) {
              continue;
          }

          if ( 1 === (int) $new_settings[ $taxonomy ] ) {
              $this->trigger_taxonomy_update( $taxonomy );
          }
      }

      return $new_settings;
  }
  /**
   * Get the threshold for the similarity calculation.
   *
   * Reads the `{taxonomy}_threshold` percentage from the Classification
   * settings (75 when unset, 1 when no taxonomy is given) and turns it
   * into `1 - percent / 100` before filtering.
   *
   * @since 2.5.0
   *
   * @param string $taxonomy Taxonomy slug.
   * @return float
   */
  public function get_threshold( string $taxonomy = '' ): float {
      $settings = ( new Classification() )->get_settings();
      $percent  = 1;

      if ( ! empty( $taxonomy ) ) {
          $percent = $settings[ $taxonomy . '_threshold' ] ?? 75;
      }

      // Percentage to decimal, inverted.
      $decimal = 1 - ( (float) $percent / 100 );

      /**
       * Filter the threshold for the similarity calculation.
       *
       * @since 2.5.0
       * @hook classifai_threshold
       *
       * @param {float} $threshold The threshold to use.
       * @param {string} $taxonomy The taxonomy to get the threshold for.
       *
       * @return {float} The threshold to use.
       */
      return apply_filters( 'classifai_threshold', $decimal, $taxonomy );
  }
  /**
   * Regenerate embeddings.
   *
   * Triggers a full embedding update for every enabled taxonomy and
   * deletes the stored embeddings of all posts. Useful to run anytime
   * the model or dimensions are changed.
   *
   * Does nothing when the Classification feature is disabled or uses a
   * different provider. Otherwise it ends the request with a redirect
   * back to the Classification settings screen.
   */
  public function regenerate_embeddings() {
      $feature = new Classification();

      if (
          ! $feature->is_feature_enabled() ||
          $feature->get_feature_provider_instance()::ID !== static::ID
      ) {
          return;
      }

      $settings = $feature->get_settings();

      // Regenerate embeddings for all terms.
      foreach ( array_keys( $this->nlu_features ) as $feature_name ) {
          if ( isset( $settings[ $feature_name ] ) && 1 === (int) $settings[ $feature_name ] ) {
              $this->trigger_taxonomy_update( $feature_name, true );
          }
      }

      // Delete all post embeddings. Removing by meta key covers every post
      // type and status; the previous get_posts() lookup only matched
      // published, searchable posts and loaded every ID into memory first.
      delete_post_meta_by_key( 'classifai_openai_embeddings' );

      // Hide the admin notice.
      update_option( 'classifai_hide_embeddings_notice', true, false );

      // Set a notice to let the user know the embeddings have been regenerated.
      $notifications = new Notifications();
      $notifications->set_notice(
          esc_html__( 'Embeddings have been regenerated.', 'classifai' ),
          'success',
      );

      // Redirect to the same page but remove the nonce so we don't run this again.
      $redirect_url = admin_url( 'tools.php?page=classifai#/language_processing/feature_classification' );

      if ( should_use_legacy_settings_panel() ) {
          $redirect_url = admin_url( 'tools.php?page=classifai&tab=language_processing&feature=feature_classification' );
      }

      wp_safe_redirect( $redirect_url );
      exit;
  }
  /**
   * Get the data to preview terms.
   *
   * AJAX handler: verifies the `classifai-previewer-action` nonce, forces
   * fresh embeddings for the posted `post_id` and responds with the
   * matching terms as JSON. An empty list is sent when no embeddings
   * could be generated.
   *
   * @since 2.5.0
   */
  public function get_post_classifier_embeddings_preview_data() {
      $nonce = isset( $_POST['nonce'] ) ? sanitize_text_field( wp_unslash( $_POST['nonce'] ) ) : false;

      if ( ! $nonce || ! wp_verify_nonce( $nonce, 'classifai-previewer-action' ) ) {
          wp_send_json_error( esc_html__( 'Failed nonce check.', 'classifai' ) );
      }

      // filter_input() yields null/false/'' when post_id is missing or empty.
      // Passing that to the int-typed parameter below is a TypeError on PHP 8,
      // so normalise to an integer and reject anything that is not a real ID.
      $post_id = (int) filter_input( INPUT_POST, 'post_id', FILTER_SANITIZE_NUMBER_INT );

      if ( $post_id <= 0 ) {
          wp_send_json_error( esc_html__( 'A valid post ID is required.', 'classifai' ) );
      }

      $embeddings       = $this->generate_embeddings_for_post( $post_id, true );
      $embeddings_terms = [];

      // Add terms to this item based on embedding data.
      if ( $embeddings && ! is_wp_error( $embeddings ) ) {
          $embeddings_terms = $this->get_terms( $embeddings );

          if ( is_wp_error( $embeddings_terms ) ) {
              wp_send_json_error( $embeddings_terms->get_error_message() );
          }
      }

      wp_send_json_success( $embeddings_terms );
  }
  /**
   * Handle the `admin_post_classifai_regen_embeddings` request.
   *
   * Requires both a valid `regen_embeddings` nonce and the capability to
   * manage options before regenerating embeddings; otherwise the request
   * is terminated with a permission error.
   */
  public function classifai_regen_embeddings() {
      // A nonce only proves intent, not authorization, so also require a
      // capability. NOTE(review): manage_options is assumed to match the
      // capability guarding the ClassifAI settings screen — confirm.
      $is_authorized = current_user_can( 'manage_options' );

      $has_valid_nonce = isset( $_GET['embeddings_nonce'] ) &&
          wp_verify_nonce( sanitize_text_field( wp_unslash( $_GET['embeddings_nonce'] ) ), 'regen_embeddings' );

      if ( ! $is_authorized || ! $has_valid_nonce ) {
          wp_die( esc_html__( 'You do not have permission to perform this operation.', 'classifai' ) );
      }

      $this->regenerate_embeddings();
  }
  /**
   * Trigger embedding generation for content being saved.
   *
   * Returns the embeddings stored in post meta unless `$force` is set.
   * Otherwise the normalized content is chunked, embeddings are requested
   * (one request per chunk when the content exceeds the token limit, a
   * single request otherwise) and the result is saved to post meta.
   *
   * @param int  $post_id ID of post being saved.
   * @param bool $force   Whether to force generation of embeddings even if they already exist. Default false.
   * @return array|WP_Error Embeddings (possibly empty), or an error when
   *                        classification is not allowed for this request.
   */
  public function generate_embeddings_for_post( int $post_id, bool $force = false ) {
      // Don't run on autosaves.
      if ( defined( 'DOING_AUTOSAVE' ) && DOING_AUTOSAVE ) {
          return new WP_Error( 'invalid', esc_html__( 'Classification will not work during an autosave.', 'classifai' ) );
      }

      // Ensure the user has permissions to edit.
      if ( ! current_user_can( 'edit_post', $post_id ) && ( ! defined( 'WP_CLI' ) || ! WP_CLI ) ) {
          return new WP_Error( 'invalid', esc_html__( 'User does not have permission to classify this item.', 'classifai' ) );
      }

      /**
       * Filter whether ClassifAI should classify an item.
       *
       * Default is true, return false to skip classifying.
       *
       * @since 2.2.0
       * @hook classifai_openai_embeddings_should_classify
       *
       * @param {bool}   $should_classify Whether the item should be classified. Default `true`, return `false` to skip.
       * @param {int}    $id              The ID of the item to be considered for classification.
       * @param {string} $type            The type of item to be considered for classification.
       *
       * @return {bool} Whether the item should be classified.
       */
      if ( ! apply_filters( 'classifai_openai_embeddings_should_classify', true, $post_id, 'post' ) ) {
          return new WP_Error( 'invalid', esc_html__( 'Classification is disabled for this item.', 'classifai' ) );
      }

      // Try to use the stored embeddings first.
      if ( ! $force ) {
          $embeddings = get_post_meta( $post_id, 'classifai_openai_embeddings', true );

          if ( ! empty( $embeddings ) ) {
              return $embeddings;
          }
      }

      // Chunk the post content down.
      $embeddings     = [];
      $content        = $this->get_normalized_content( $post_id, 'post' );
      $content_chunks = $this->chunk_content( $content );

      // Get the embeddings for each chunk.
      if ( ! empty( $content_chunks ) ) {
          // Resolve the filtered limit once so the tokenizer and the comparison
          // below agree. The raw property was compared before, which ignored
          // the classifai_openai_embeddings_max_tokens filter.
          $max_tokens   = $this->get_max_tokens();
          $tokenizer    = new Tokenizer( $max_tokens );
          $total_tokens = $tokenizer->tokens_in_content( $content );

          // If we have a lot of tokens, we need to get embeddings for each chunk individually.
          if ( $max_tokens < $total_tokens ) {
              foreach ( $content_chunks as $chunk ) {
                  $embedding = $this->generate_embedding( $chunk );

                  if ( $embedding && ! is_wp_error( $embedding ) ) {
                      $embeddings[] = array_map( 'floatval', $embedding );
                  }
              }
          } else {
              // Otherwise let's get all embeddings in a single request.
              $all_embeddings = $this->generate_embeddings( $content_chunks );

              if ( $all_embeddings && ! is_wp_error( $all_embeddings ) ) {
                  $embeddings = array_map(
                      function ( $embedding ) {
                          return array_map( 'floatval', $embedding );
                      },
                      $all_embeddings
                  );
              }
          }
      }

      // Store the embeddings for future use.
      if ( ! empty( $embeddings ) ) {
          update_post_meta( $post_id, 'classifai_openai_embeddings', $embeddings );
      }

      return $embeddings;
  }
  /**
   * Add terms to a post based on embeddings.
   *
   * Scores every embedding chunk against the stored term embeddings
   * (threshold applied), keeps the lowest-scoring entry per term and
   * groups the matches by taxonomy. With `$link` the terms are assigned
   * to the post; without it a `name => term_id` map per taxonomy is built.
   *
   * @param int   $post_id    ID of post to set terms on.
   * @param array $embeddings Embeddings data.
   * @param bool  $link       Whether to link the terms or not.
   * @return array|WP_Error The per-taxonomy map when one was built, otherwise
   *                        the de-duplicated match list; WP_Error on bad input
   *                        or when nothing matched.
   */
  public function set_terms( int $post_id = 0, array $embeddings = [], bool $link = true ) {
      if ( ! $post_id || ! get_post( $post_id ) ) {
          return new WP_Error( 'post_id_required', esc_html__( 'A valid post ID is required to set terms.', 'classifai' ) );
      }

      if ( empty( $embeddings ) ) {
          return new WP_Error( 'data_required', esc_html__( 'Valid embedding data is required to set terms.', 'classifai' ) );
      }

      // Collect the matches of every embedding chunk.
      $matches = [];

      foreach ( $embeddings as $chunk_embedding ) {
          $matches = array_merge( $matches, $this->get_embeddings_similarity( $chunk_embedding ) );
      }

      if ( empty( $matches ) ) {
          return new WP_Error( 'invalid', esc_html__( 'No matching terms found.', 'classifai' ) );
      }

      /**
       * Fires after the embeddings similarity has been run but before results are sorted.
       *
       * @since 3.3.1
       * @hook classifai_openai_embeddings_pre_sort_embeddings_similarity
       *
       * @param {array} $embeddings_similarity The embeddings similarity results.
       * @param {int}   $post_id               ID of post to set terms on.
       * @param {array} $embeddings            Embeddings data.
       * @param {bool}  $link                  Whether to link the terms or not.
       */
      do_action( 'classifai_openai_embeddings_pre_sort_embeddings_similarity', $matches, $post_id, $embeddings, $link );

      // Order ascending by similarity value.
      usort(
          $matches,
          static function ( $left, $right ) {
              return $left['similarity'] <=> $right['similarity'];
          }
      );

      // Keep only the first (lowest) entry for each term_id.
      $first_seen = array_unique( array_column( $matches, 'term_id' ) );
      $matches    = array_intersect_key( $matches, $first_seen );

      // Group what is left by taxonomy.
      $by_taxonomy = [];

      foreach ( $matches as $match ) {
          $by_taxonomy[ $match['taxonomy'] ][] = $match;
      }

      /**
       * Fires after the embeddings similarity has been run and sorted.
       *
       * @since 3.3.1
       * @hook classifai_openai_embeddings_post_sort_embeddings_similarity
       *
       * @param {array} $sorted_results        The sorted embeddings similarity results.
       * @param {array} $embeddings_similarity The embeddings similarity results.
       * @param {int}   $post_id               ID of post to set terms on.
       * @param {array} $embeddings            Embeddings data.
       * @param {bool}  $link                  Whether to link the terms or not.
       */
      do_action( 'classifai_openai_embeddings_post_sort_embeddings_similarity', $by_taxonomy, $matches, $post_id, $embeddings, $link );

      $preview = [];

      foreach ( $by_taxonomy as $tax => $terms ) {
          // Linking: assign the terms to the post right away (replacing existing ones).
          if ( $link ) {
              $term_ids = array_map( 'absint', array_column( $terms, 'term_id' ) );
              wp_set_object_terms( $post_id, $term_ids, $tax, false );
              continue;
          }

          // Not linking: collect name => ID pairs for display in the UI.
          $named_terms = [];

          foreach ( $terms as $term ) {
              $found_term = get_term( $term['term_id'] );

              if ( ! $found_term || is_wp_error( $found_term ) ) {
                  continue;
              }

              $named_terms[ $found_term->name ] = $term['term_id'];
          }

          $preview[ $tax ] = $named_terms;
      }

      if ( empty( $preview ) ) {
          return $matches;
      }

      return $preview;
  }
  /**
   * Determine which terms best match a post based on embeddings.
   *
   * Scores every embedding chunk against the stored term embeddings
   * (threshold ignored), keeps the best entry per term and returns, per
   * taxonomy, a label plus a list of `label`/`score` pairs. Terms that can
   * no longer be loaded are left out; a taxonomy that is not registered is
   * labelled with its slug.
   *
   * @param array $embeddings An array of embeddings data.
   * @return array|WP_Error
   */
  public function get_terms( array $embeddings = [] ) {
      if ( empty( $embeddings ) ) {
          return new WP_Error( 'data_required', esc_html__( 'Valid embedding data is required to get terms.', 'classifai' ) );
      }

      $embeddings_similarity = [];

      // Iterate through all of our embedding chunks and run our similarity calculations.
      foreach ( $embeddings as $embedding ) {
          $embeddings_similarity = array_merge( $embeddings_similarity, $this->get_embeddings_similarity( $embedding, false ) );
      }

      // Ensure we have some results.
      if ( empty( $embeddings_similarity ) ) {
          return new WP_Error( 'invalid', esc_html__( 'No matching terms found.', 'classifai' ) );
      }

      // Sort the results by similarity (ascending; lower values rank first).
      usort(
          $embeddings_similarity,
          function ( $a, $b ) {
              return $a['similarity'] <=> $b['similarity'];
          }
      );

      // Remove duplicates based on the term_id field, keeping the first entry.
      $uniques               = array_unique( array_column( $embeddings_similarity, 'term_id' ) );
      $embeddings_similarity = array_intersect_key( $embeddings_similarity, $uniques );

      $sorted_results = [];

      // Sort the results into taxonomy buckets.
      foreach ( $embeddings_similarity as $item ) {
          $sorted_results[ $item['taxonomy'] ][] = $item;
      }

      // Prepare the results.
      $results = [];

      foreach ( $sorted_results as $tax => $terms ) {
          // get_taxonomy() returns false for an unregistered taxonomy, so
          // fall back to the slug instead of dereferencing false.
          $taxonomy = get_taxonomy( $tax );
          $tax_name = $taxonomy ? $taxonomy->labels->singular_name : $tax;

          // Initialize the taxonomy bucket in results.
          $results[ $tax ] = [
              'label' => $tax_name,
              'data'  => [],
          ];

          foreach ( $terms as $term ) {
              // Same guard as set_terms(): skip terms that cannot be loaded.
              $found_term = get_term( $term['term_id'] );

              if ( ! $found_term || is_wp_error( $found_term ) ) {
                  continue;
              }

              // Invert the stored value so that a higher score means a closer match.
              $score = round( ( 1 - $term['similarity'] ), 10 );

              // Store the results.
              $results[ $tax ]['data'][] = [
                  'label' => $found_term->name,
                  'score' => $score,
              ];
          }
      }

      return $results;
  }
  /**
   * Get the similarity between an embedding and all terms.
   *
   * For each taxonomy of the Classification feature, loads up to
   * `get_max_terms()` terms that have stored embeddings and compares the
   * given embedding with every stored chunk. A result is kept when the
   * calculation succeeded and, if the threshold is considered, the value
   * is at or below the taxonomy's threshold.
   *
   * @since 2.5.0
   *
   * @param array $embedding          Embedding data.
   * @param bool  $consider_threshold Whether to consider the threshold setting.
   * @return array List of `taxonomy` / `term_id` / `similarity` entries.
   */
  private function get_embeddings_similarity( array $embedding, bool $consider_threshold = true ): array {
      $classification = new Classification();
      $matches        = [];
      $taxonomies     = $classification->get_all_feature_taxonomies();
      $calculator     = new EmbeddingCalculations();

      foreach ( $taxonomies as $tax ) {
          if ( is_numeric( $tax ) ) {
              continue;
          }

          $excluded_ids = [];

          // Map the legacy setting names onto real taxonomy slugs.
          if ( 'tags' === $tax ) {
              $tax = 'post_tag';
          } elseif ( 'categories' === $tax ) {
              $tax = 'category';

              // Exclude the uncategorized term.
              $uncategorized = get_term_by( 'name', 'Uncategorized', 'category' );

              if ( $uncategorized ) {
                  $excluded_ids = [ $uncategorized->term_id ];
              }
          }

          $term_ids = get_terms(
              [
                  'taxonomy'   => $tax,
                  'orderby'    => 'count',
                  'order'      => 'DESC',
                  'hide_empty' => false,
                  'fields'     => 'ids',
                  'meta_key'   => 'classifai_openai_embeddings', // phpcs:ignore WordPress.DB.SlowDBQuery.slow_db_query_meta_key
                  'number'     => $this->get_max_terms(),
                  'exclude'    => $excluded_ids, // phpcs:ignore WordPressVIPMinimum.Performance.WPQueryParams.PostNotIn_exclude
              ]
          );

          if ( is_wp_error( $term_ids ) || empty( $term_ids ) ) {
              continue;
          }

          // Threshold configured for this taxonomy.
          $threshold = $this->get_threshold( $tax );

          foreach ( $term_ids as $term_id ) {
              // Outside WP-CLI, only consider terms the current user may assign.
              if ( ! current_user_can( 'assign_term', $term_id ) && ! ( defined( 'WP_CLI' ) && WP_CLI ) ) {
                  continue;
              }

              $term_chunks = get_term_meta( $term_id, 'classifai_openai_embeddings', true );

              if ( empty( $term_chunks ) ) {
                  continue;
              }

              // Compare against every stored chunk of the term.
              foreach ( $term_chunks as $chunk ) {
                  $similarity = $calculator->cosine_similarity( $embedding, $chunk );

                  /**
                   * Fires after the embeddings similarity has been run for a single chunk.
                   *
                   * @since 3.3.1
                   * @hook classifai_openai_embeddings_single_embedding_similarity
                   *
                   * @param {bool|float} $similarity         The embeddings similarity result.
                   * @param {array}      $embedding          Post embedding data.
                   * @param {array}      $chunk              Term chunk embedding data.
                   * @param {int}        $term_id            ID of term we're comparing.
                   * @param {string}     $tax                Taxonomy of term.
                   * @param {bool}       $consider_threshold Whether to consider the threshold or not.
                   */
                  do_action( 'classifai_openai_embeddings_single_embedding_similarity', $similarity, $embedding, $chunk, $term_id, $tax, $consider_threshold );

                  if ( false === $similarity ) {
                      continue;
                  }

                  if ( $consider_threshold && ! ( $similarity <= $threshold ) ) {
                      continue;
                  }

                  $matches[] = [
                      'taxonomy'   => $tax,
                      'term_id'    => $term_id,
                      'similarity' => $similarity,
                  ];
              }
          }
      }

      return $matches;
  }
  /**
   * Schedules the job to generate embedding data for all terms within a taxonomy.
   *
   * Does nothing unless the Classification feature is enabled with this
   * provider. Builds the get_terms() query (terms without embeddings by
   * default, every term when `$all` is set) and enqueues an async Action
   * Scheduler job carrying that query.
   *
   * @param string $taxonomy Taxonomy slug.
   * @param bool   $all      Whether to generate embeddings for all terms or just those without embeddings.
   * @param array  $args     Overridable query args for get_terms()
   * @param int    $user_id  The user ID to run this as.
   */
  public function trigger_taxonomy_update( string $taxonomy = '', bool $all = false, array $args = [], int $user_id = 0 ) {
      $classification = new Classification();

      $is_active_provider = $classification->is_feature_enabled()
          && $classification->get_feature_provider_instance()::ID === static::ID;

      if ( ! $is_active_provider ) {
          return;
      }

      // Exclude the uncategorized term.
      $excluded_ids = [];

      if ( 'category' === $taxonomy ) {
          $uncategorized = get_term_by( 'name', 'Uncategorized', 'category' );

          if ( $uncategorized ) {
              $excluded_ids = [ $uncategorized->term_id ];
          }
      }

      /**
       * Filter the number of terms to process in a batch.
       *
       * @since 3.1.0
       * @hook classifai_openai_embeddings_terms_per_job
       *
       * @param {int} $number Number of terms to process per job.
       *
       * @return {int} Filtered number of terms to process per job.
       */
      $terms_per_job = apply_filters( 'classifai_openai_embeddings_terms_per_job', 100 );

      // Caller-supplied args win over the defaults.
      $query_args = array_merge(
          [
              'taxonomy'     => $taxonomy,
              'orderby'      => 'count',
              'order'        => 'DESC',
              'hide_empty'   => false,
              'fields'       => 'ids',
              'meta_key'     => 'classifai_openai_embeddings', // phpcs:ignore WordPress.DB.SlowDBQuery.slow_db_query_meta_key
              'meta_compare' => 'NOT EXISTS',
              'number'       => $terms_per_job,
              'offset'       => 0,
              'exclude'      => $excluded_ids, // phpcs:ignore WordPressVIPMinimum.Performance.WPQueryParams.PostNotIn_exclude
          ],
          $args
      );

      if ( $all ) {
          // Every term is wanted, so drop the "has no embeddings" restriction.
          unset( $query_args['meta_key'], $query_args['meta_compare'] );
      } else {
          unset( $query_args['offset'] );
      }

      $run_as = 0 === $user_id ? get_current_user_id() : $user_id;

      $job_hook = 'classifai_schedule_generate_embedding_job';
      $job_args = [
          'taxonomy' => $taxonomy,
          'all'      => $all,
          'args'     => $query_args,
          'user_id'  => $run_as,
      ];

      // We return early and don't schedule the job if there are no terms.
      if ( function_exists( 'as_has_scheduled_action' ) && ! \as_has_scheduled_action( $job_hook, $job_args ) ) {
          $pending_terms = get_terms( $query_args );

          if ( is_wp_error( $pending_terms ) || empty( $pending_terms ) ) {
              return;
          }
      }

      if ( function_exists( 'as_enqueue_async_action' ) ) {
          \as_enqueue_async_action( $job_hook, $job_args );
      }
  }
  /**
   * Job to generate embedding data for all terms within a taxonomy.
   *
   * Processes one batch of terms selected by `$args`, then asks
   * trigger_taxonomy_update() to schedule the next batch. The chain ends
   * when the query returns no terms.
   *
   * @param string $taxonomy Taxonomy slug.
   * @param bool   $all      Whether to generate embeddings for all terms or just those without embeddings.
   * @param array  $args     Overridable query args for get_terms()
   * @param int    $user_id  The user ID to run this as.
   */
  public function generate_embedding_job( string $taxonomy = '', bool $all = false, array $args = [], int $user_id = 0 ) {
      if ( $user_id > 0 ) {
          // We set this as current_user_can() fails when this function runs
          // under the context of Action Scheduler.
          wp_set_current_user( $user_id );
      }

      $terms = get_terms( $args );

      if ( is_wp_error( $terms ) || empty( $terms ) ) {
          return;
      }

      // Generate embedding data for each term, remembering the ones that failed.
      $failed = [];

      foreach ( array_values( $terms ) as $term_id ) {
          /** @var int $term_id */
          $has_generated = $this->generate_embeddings_for_term( $term_id, $all );

          if ( is_wp_error( $has_generated ) ) {
              $failed[] = $term_id;
          }
      }

      if ( $all && isset( $args['offset'], $args['number'] ) ) {
          // Offset paging already moves past failed terms. Excluding them as
          // well would shrink the result set in front of the new offset and
          // make the next batch skip terms that were never processed.
          $args['offset'] = (int) $args['offset'] + (int) $args['number'];
      } elseif ( ! empty( $failed ) ) {
          // Without paging the same query is re-run, so failed terms must be
          // excluded or they would be retried forever. `exclude` may be
          // missing or a comma-separated string; normalise it before merging.
          $args['exclude'] = array_merge( wp_parse_id_list( $args['exclude'] ?? [] ), $failed ); // phpcs:ignore WordPressVIPMinimum.Performance.WPQueryParams.PostNotIn_exclude
      }

      $this->trigger_taxonomy_update( $taxonomy, $all, $args, $user_id );
  }
  834. /**
  835. * Trigger embedding generation for term being saved.
  836. *
  837. * @param int $term_id ID of term being saved.
  838. * @param bool $force Whether to force generation of embeddings even if they already exist. Default false.
  839. * @param Feature $feature The feature instance.
  840. * @return array|WP_Error
  841. */
  842. public function generate_embeddings_for_term( int $term_id, bool $force = false, ?Feature $feature = null ) {
  843. // Ensure the user has permissions to edit.
  844. if ( ! current_user_can( 'edit_term', $term_id ) ) {
  845. return new WP_Error( 'invalid', esc_html__( 'User does not have valid permissions to edit this term.', 'classifai' ) );
  846. }
  847. $term = get_term( $term_id );
  848. if ( ! is_a( $term, '\WP_Term' ) ) {
  849. return new WP_Error( 'invalid', esc_html__( 'This is not a valid term.', 'classifai' ) );
  850. }
  851. if ( ! $feature ) {
  852. $feature = new Classification();
  853. }
  854. $taxonomies = $feature->get_all_feature_taxonomies();
  855. if ( in_array( 'tags', $taxonomies, true ) ) {
  856. $taxonomies[] = 'post_tag';
  857. }
  858. if ( in_array( 'categories', $taxonomies, true ) ) {
  859. $taxonomies[] = 'category';
  860. }
  861. // Ensure this term is part of a taxonomy we support.
  862. if ( ! in_array( $term->taxonomy, $taxonomies, true ) ) {
  863. return new WP_Error( 'invalid', esc_html__( 'This taxonomy is not supported.', 'classifai' ) );
  864. }
  865. /**
  866. * Filter whether ClassifAI should classify an item.
  867. *
  868. * Default is true, return false to skip classifying.
  869. *
  870. * @since 2.2.0
  871. * @hook classifai_openai_embeddings_should_classify
  872. *
  873. * @param {bool} $should_classify Whether the item should be classified. Default `true`, return `false` to skip.
  874. * @param {int} $id The ID of the item to be considered for classification.
  875. * @param {string} $type The type of item to be considered for classification.
  876. *
  877. * @return {bool} Whether the item should be classified.
  878. */
  879. if ( ! apply_filters( 'classifai_openai_embeddings_should_classify', true, $term_id, 'term' ) ) {
  880. return new WP_Error( 'invalid', esc_html__( 'Classification is disabled for this item.', 'classifai' ) );
  881. }
  882. // Try to use the stored embeddings first.
  883. $embeddings = get_term_meta( $term_id, 'classifai_openai_embeddings', true );
  884. if ( ! empty( $embeddings ) && ! $force ) {
  885. return $embeddings;
  886. }
  887. // Chunk the term content down.
  888. $embeddings = [];
  889. $content = $this->get_normalized_content( $term_id, 'term' );
  890. $content_chunks = $this->chunk_content( $content );
  891. // Get the embeddings for each chunk.
  892. if ( ! empty( $content_chunks ) ) {
  893. foreach ( $content_chunks as $chunk ) {
  894. $embedding = $this->generate_embedding( $chunk, $feature );
  895. if ( $embedding && ! is_wp_error( $embedding ) ) {
  896. $embeddings[] = array_map( 'floatval', $embedding );
  897. }
  898. }
  899. }
  900. // Store the embeddings for future use.
  901. if ( ! empty( $embeddings ) ) {
  902. update_term_meta( $term_id, 'classifai_openai_embeddings', $embeddings );
  903. }
  904. return $embeddings;
  905. }
  906. /**
  907. * Generate an embedding for a particular piece of text.
  908. *
  909. * @param string $text Text to generate the embedding for.
  910. * @param Feature|null $feature Feature instance.
  911. * @return array|boolean|WP_Error
  912. */
  913. public function generate_embedding( string $text = '', $feature = null ) {
  914. if ( ! $feature ) {
  915. $feature = new Classification();
  916. }
  917. $settings = $feature->get_settings();
  918. // Ensure the feature is enabled.
  919. if ( ! $feature->is_feature_enabled() ) {
  920. return new WP_Error( 'not_enabled', esc_html__( 'Classification is disabled or OpenAI authentication failed. Please check your settings.', 'classifai' ) );
  921. }
  922. $request = new APIRequest( $settings[ static::ID ]['api_key'] ?? '', $feature->get_option_name() );
  923. /**
  924. * Filter the request body before sending to OpenAI.
  925. *
  926. * @since 2.2.0
  927. * @hook classifai_openai_embeddings_request_body
  928. *
  929. * @param {array} $body Request body that will be sent to OpenAI.
  930. * @param {string} $text Text we are getting embeddings for.
  931. *
  932. * @return {array} Request body.
  933. */
  934. $body = apply_filters(
  935. 'classifai_openai_embeddings_request_body',
  936. [
  937. 'model' => $this->get_model(),
  938. 'input' => $text,
  939. 'dimensions' => $this->get_dimensions(),
  940. ],
  941. $text
  942. );
  943. // Make our API request.
  944. $response = $request->post(
  945. $this->get_api_url(),
  946. [
  947. 'body' => wp_json_encode( $body ),
  948. ]
  949. );
  950. set_transient( 'classifai_openai_embeddings_latest_response', $response, DAY_IN_SECONDS * 30 );
  951. if ( is_wp_error( $response ) ) {
  952. return $response;
  953. }
  954. if ( empty( $response['data'] ) ) {
  955. return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'classifai' ) );
  956. }
  957. $return = [];
  958. // Parse out the embeddings response.
  959. foreach ( $response['data'] as $data ) {
  960. if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) {
  961. continue;
  962. }
  963. $return = $data['embedding'];
  964. break;
  965. }
  966. return $return;
  967. }
  968. /**
  969. * Generate embeddings for an array of text.
  970. *
  971. * @param array $strings Array of text to generate embeddings for.
  972. * @param Feature|null $feature Feature instance.
  973. * @return array|boolean|WP_Error
  974. */
  975. public function generate_embeddings( array $strings = [], $feature = null ) {
  976. if ( ! $feature ) {
  977. $feature = new Classification();
  978. }
  979. $settings = $feature->get_settings();
  980. // Ensure the feature is enabled.
  981. if ( ! $feature->is_feature_enabled() ) {
  982. return new WP_Error( 'not_enabled', esc_html__( 'Classification is disabled or OpenAI authentication failed. Please check your settings.', 'classifai' ) );
  983. }
  984. $request = new APIRequest( $settings[ static::ID ]['api_key'] ?? '', $feature->get_option_name() );
  985. /**
  986. * Filter the request body before sending to OpenAI.
  987. *
  988. * @since 2.2.0
  989. * @hook classifai_openai_embeddings_request_body
  990. *
  991. * @param {array} $body Request body that will be sent to OpenAI.
  992. * @param {array} $strings Array of text we are getting embeddings for.
  993. *
  994. * @return {array} Request body.
  995. */
  996. $body = apply_filters(
  997. 'classifai_openai_embeddings_request_body',
  998. [
  999. 'model' => $this->get_model(),
  1000. 'input' => $strings,
  1001. 'dimensions' => $this->get_dimensions(),
  1002. ],
  1003. $strings
  1004. );
  1005. // Make our API request.
  1006. $response = $request->post(
  1007. $this->get_api_url(),
  1008. [
  1009. 'body' => wp_json_encode( $body ),
  1010. ]
  1011. );
  1012. if ( is_wp_error( $response ) ) {
  1013. return $response;
  1014. }
  1015. if ( empty( $response['data'] ) ) {
  1016. return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'classifai' ) );
  1017. }
  1018. $return = [];
  1019. // Parse out the embeddings response.
  1020. foreach ( $response['data'] as $data ) {
  1021. if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) {
  1022. continue;
  1023. }
  1024. $return[] = $data['embedding'];
  1025. }
  1026. return $return;
  1027. }
  1028. /**
  1029. * Chunk content into smaller pieces with an overlap.
  1030. *
  1031. * @param string $content Content to chunk.
  1032. * @param int $chunk_size Size of each chunk, in words.
  1033. * @param int $overlap_size Overlap size for each chunk, in words.
  1034. * @return array
  1035. */
  1036. public function chunk_content( string $content = '', int $chunk_size = 150, $overlap_size = 25 ): array {
  1037. // Remove multiple whitespaces.
  1038. $content = preg_replace( '/\s+/', ' ', $content );
  1039. // Split text by single whitespace.
  1040. $words = explode( ' ', $content );
  1041. $chunks = [];
  1042. $text_count = count( $words );
  1043. // Iterate through and chunk data with an overlap.
  1044. for ( $i = 0; $i < $text_count; $i += $chunk_size ) {
  1045. // Join a set of words into a string.
  1046. $chunk = implode(
  1047. ' ',
  1048. array_slice(
  1049. $words,
  1050. max( $i - $overlap_size, 0 ),
  1051. $chunk_size + $overlap_size
  1052. )
  1053. );
  1054. array_push( $chunks, $chunk );
  1055. }
  1056. return $chunks;
  1057. }
  1058. /**
  1059. * Get our content, ensuring it is normalized.
  1060. *
  1061. * @param int $id ID of item to get content from.
  1062. * @param string $type Type of content. Default 'post'.
  1063. * @return string
  1064. */
  1065. public function get_normalized_content( int $id = 0, string $type = 'post' ): string {
  1066. $normalizer = new Normalizer();
  1067. $content = '';
  1068. // Get the content depending on the type.
  1069. switch ( $type ) {
  1070. case 'post':
  1071. // This will include the post_title and post_content.
  1072. $content = $normalizer->normalize( $id );
  1073. break;
  1074. case 'term':
  1075. $content = '';
  1076. $term = get_term( $id );
  1077. if ( is_a( $term, '\WP_Term' ) ) {
  1078. $content = $term->name . ' ' . $term->slug . ' ' . $term->description;
  1079. }
  1080. break;
  1081. }
  1082. /**
  1083. * Filter content that will get sent to OpenAI.
  1084. *
  1085. * @since 2.2.0
  1086. * @hook classifai_openai_embeddings_content
  1087. *
  1088. * @param {string} $content Content that will be sent to OpenAI.
  1089. * @param {int} $post_id ID of post we are submitting.
  1090. * @param {string} $type Type of content.
  1091. *
  1092. * @return {string} Content.
  1093. */
  1094. return apply_filters( 'classifai_openai_embeddings_content', $content, $id, $type );
  1095. }
  1096. /**
  1097. * Common entry point for all REST endpoints for this provider.
  1098. *
  1099. * @param int $post_id The Post Id we're processing.
  1100. * @param string $route_to_call The route we are processing.
  1101. * @param array $args Optional arguments to pass to the route.
  1102. * @return array|string|WP_Error
  1103. */
  1104. public function rest_endpoint_callback( $post_id = 0, string $route_to_call = '', array $args = [] ) {
  1105. if ( ! $post_id || ! get_post( $post_id ) ) {
  1106. return new WP_Error( 'post_id_required', esc_html__( 'A valid post ID is required to run classification.', 'classifai' ) );
  1107. }
  1108. $route_to_call = strtolower( $route_to_call );
  1109. $return = '';
  1110. // Handle all of our routes.
  1111. switch ( $route_to_call ) {
  1112. case 'classify':
  1113. $return = $this->generate_embeddings_for_post( $post_id, true );
  1114. break;
  1115. }
  1116. return $return;
  1117. }
  1118. /**
  1119. * Returns the debug information for the provider settings.
  1120. *
  1121. * @return array
  1122. */
  1123. public function get_debug_information(): array {
  1124. $settings = $this->feature_instance->get_settings();
  1125. $debug_info = [];
  1126. if ( $this->feature_instance instanceof Classification ) {
  1127. foreach ( array_keys( $this->feature_instance->get_supported_taxonomies() ) as $tax ) {
  1128. $debug_info[ "Taxonomy ($tax)" ] = Feature::get_debug_value_text( $settings[ $tax ], 1 );
  1129. $debug_info[ "Taxonomy ($tax threshold)" ] = Feature::get_debug_value_text( $settings[ $tax . '_threshold' ], 1 );
  1130. }
  1131. $debug_info[ __( 'Latest response', 'classifai' ) ] = $this->get_formatted_latest_response( get_transient( 'classifai_openai_embeddings_latest_response' ) );
  1132. }
  1133. return apply_filters(
  1134. 'classifai_' . self::ID . '_debug_information',
  1135. $debug_info,
  1136. $settings,
  1137. $this->feature_instance
  1138. );
  1139. }
  1140. /**
  1141. * Get embeddings generation status.
  1142. *
  1143. * @return bool
  1144. */
  1145. public function is_embeddings_generation_in_progress(): bool {
  1146. return self::$scheduler_instance->is_embeddings_generation_in_progress();
  1147. }
  1148. }