Source: Providers/Azure/Read.php

  1. <?php
  2. /**
  3. * Scan PDF files to extract visible text with the AI Vision Read service.
  4. *
  5. * @since 1.6.1
  6. * @package Classifai
  7. */
  8. namespace Classifai\Providers\Azure;
  9. use WP_Error;
  10. use function Classifai\computer_vision_max_filesize;
  11. /**
  12. * Read class
  13. *
  14. * Connects to AI Vision's Read endpoint to detect text.
  15. *
  16. * @see https://docs.microsoft.com/en-us/rest/api/cognitiveservices/computervision/recognizeprintedtext/
  17. */
  18. class Read {
  19. /**
  20. * The AI Vision API path to the Read service.
  21. *
  22. * @var string
  23. */
  24. const API_PATH = 'vision/v3.2/read/';
  25. /**
  26. * ComputerVision settings.
  27. *
  28. * @var array
  29. */
  30. private $settings;
  31. /**
  32. * Attachment ID to process.
  33. *
  34. * @var int
  35. */
  36. private $attachment_id;
  37. /**
  38. * Feature instance.
  39. *
  40. * @var \Classifai\Features\PDFTextExtraction
  41. */
  42. private $feature;
  /**
   * Set up a Read handler bound to a single attachment.
   *
   * @param array                                 $settings      Computer Vision settings.
   * @param int                                   $attachment_id ID of the attachment to process.
   * @param \Classifai\Features\PDFTextExtraction $feature       Feature instance.
   */
  public function __construct( array $settings, int $attachment_id, $feature ) {
    $this->feature       = $feature;
    $this->attachment_id = $attachment_id;
    $this->settings      = $settings;
  }
  /**
   * Assemble the full URL for a Read API request.
   *
   * The result is the configured endpoint URL (with a trailing slash enforced),
   * followed by the Read API path and then the optional `$path` suffix.
   *
   * @param string $path Path to append after the Read API path. Default empty.
   * @return string
   */
  public function get_api_url( string $path = '' ): string {
    $endpoint = trailingslashit( $this->settings['endpoint_url'] );

    return $endpoint . static::API_PATH . $path;
  }
  /**
   * Check if Read processing should be applied to the attachment.
   *
   * The attachment's mime type is mapped back to its file extension(s) through
   * `wp_get_mime_types()`; processing is approved when one of them is `pdf`.
   * The decision can be overridden with the
   * `classifai_azure_read_should_process` filter.
   *
   * @return bool
   */
  public function should_process(): bool {
    $approved_media_types = [ 'pdf' ];
    $process              = false;

    $mime_type = get_post_mime_type( $this->attachment_id );

    // array_search() yields the matching key (e.g. `jpg|jpeg|jpe`) or false when
    // the mime type is unknown; only split it when a match was found.
    $extensions = $mime_type ? array_search( $mime_type, wp_get_mime_types(), true ) : false;

    if ( false !== $extensions ) {
      foreach ( explode( '|', (string) $extensions ) as $ext ) {
        if ( in_array( $ext, $approved_media_types, true ) ) {
          $process = true;
          break;
        }
      }
    }

    /**
     * Filters whether to run Read processing on this attachment item
     *
     * @since 1.7.0
     * @hook classifai_azure_read_should_process
     *
     * @param {bool} $process Whether to run OCR processing or not.
     * @param {int} $attachment_id The attachment ID.
     *
     * @return {bool} Whether this attachment should have OCR processing.
     */
    // Cast so a filter callback returning a non-boolean cannot violate the declared return type.
    return (bool) apply_filters( 'classifai_azure_read_should_process', $process, $this->attachment_id );
  }
  /**
   * Call the Azure Read API.
   *
   * Validates authentication, file type and file size, then submits the
   * attachment URL to the Read `analyze` endpoint. When the service accepts the
   * request (HTTP 202) the result is polled through `check_read_result()`.
   * Every failure is recorded on the attachment via `log_error()`.
   *
   * @return WP_Error|null WP_Error on failure, otherwise the return value of
   *                       `check_read_result()`.
   */
  public function read_document() {
    // Check if valid authentication is in place.
    if ( empty( $this->settings ) || ( isset( $this->settings['authenticated'] ) && false === $this->settings['authenticated'] ) ) {
      return $this->log_error( new WP_Error( 'auth', esc_html__( 'Please set up valid authentication with Azure.', 'classifai' ) ) );
    }

    if ( ! $this->should_process() ) {
      return $this->log_error( new WP_Error( 'process_error', esc_html__( 'Document does not meet processing requirements.', 'classifai' ) ) );
    }

    // get_attached_file() returns false for an unknown attachment and the file
    // may be absent from disk; filesize() would emit a PHP warning in both
    // cases, so treat them as "size not found" without calling it.
    $file_path    = get_attached_file( $this->attachment_id );
    $filesize     = ( $file_path && file_exists( $file_path ) ) ? filesize( $file_path ) : false;
    $max_filesize = computer_vision_max_filesize();

    if ( ! $filesize || $filesize > $max_filesize ) {
      return $this->log_error(
        new WP_Error(
          'size_error',
          esc_html(
            sprintf(
              // translators: %1$s is the document file size in bytes, %2$s is the current default max filesize in bytes, %3$s is the integer '4 * MB_IN_BYTES'
              __( 'Document (%1$s bytes) does not meet size requirements. Please ensure it is smaller than the maximum threshold (currently %2$s bytes, defaults to %3$s bytes).', 'classifai' ),
              ! $filesize ? __( 'size not found', 'classifai' ) : $filesize,
              $max_filesize,
              4 * MB_IN_BYTES
            )
          ),
          $filesize
        )
      );
    }

    /**
     * Filters the request arguments sent to Read endpoint.
     *
     * @since 1.7.0
     * @hook classifai_azure_read_request_args
     *
     * @param {array} $args Query arguments added to the analyze request URL. Default empty array.
     * @param {int} $attachment_id The attachment ID.
     *
     * @return {array} Filtered request arguments.
     */
    $request_args = apply_filters( 'classifai_azure_read_request_args', [], $this->attachment_id );

    $url = add_query_arg(
      $request_args,
      $this->get_api_url( 'analyze' )
    );

    $document_url = wp_get_attachment_url( $this->attachment_id );

    if ( ! $document_url ) {
      return $this->log_error( new WP_Error( 'invalid_attachment', esc_html__( 'Document does not exist.', 'classifai' ) ) );
    }

    $response = wp_remote_post(
      $url,
      [
        'body'    => wp_json_encode(
          [
            'url' => $document_url,
          ]
        ),
        'headers' => [
          'Content-Type'              => 'application/json',
          // Fall back to an empty key rather than raising an undefined-index notice.
          'Ocp-Apim-Subscription-Key' => $this->settings['api_key'] ?? '',
        ],
      ]
    );

    /**
     * Fires after the request to the read endpoint has run.
     *
     * @since 1.5.0
     * @hook classifai_azure_read_after_request
     *
     * @param {array|WP_Error} Response data or a WP_Error if the request failed.
     * @param {string} The request URL with query args added.
     * @param {int} The document ID.
     * @param {string} The document URL.
     */
    do_action( 'classifai_azure_read_after_request', $response, $url, $this->attachment_id, $document_url );

    if ( is_wp_error( $response ) ) {
      return $this->log_error( $response );
    }

    // 202 means the operation was accepted; the header carries the URL to poll.
    if ( 202 === wp_remote_retrieve_response_code( $response ) ) {
      $operation_url = wp_remote_retrieve_header( $response, 'Operation-Location' );

      if ( ! filter_var( $operation_url, FILTER_VALIDATE_URL ) ) {
        return $this->log_error( new WP_Error( 'invalid_read_operation_url', esc_html__( 'Operation URL is invalid.', 'classifai' ) ) );
      }

      return $this->check_read_result( $operation_url );
    }

    // Any other status is an error; surface the API's own code/message when present.
    $body = json_decode( wp_remote_retrieve_body( $response ), true );

    if ( empty( $body['error'] ) || empty( $body['error']['code'] ) || empty( $body['error']['message'] ) ) {
      return $this->log_error( new WP_Error( 'unknown_read_error', esc_html__( 'Unknown Read error.', 'classifai' ) ) );
    }

    return $this->log_error( new WP_Error( $body['error']['code'], $body['error']['message'] ) );
  }
  /**
   * Check the status of a Read operation and act on it.
   *
   * - `notStarted` / `running`: stores the status and schedules a single WP Cron
   *   event (`classifai_retry_get_read_result`) to check again later.
   * - `succeeded`: hands the result to `update_document_description()`.
   * - `failed`, an unknown status, a transport error or a non-200 response:
   *   logs and returns a WP_Error.
   *
   * @param string $operation_url Operation URL for checking the read status.
   * @return WP_Error|null WP_Error on failure; null when a retry was scheduled,
   *                       otherwise the return value of `update_document_description()`.
   */
  public function check_read_result( string $operation_url ) {
    // The subscription key must accompany the request regardless of which HTTP helper is used.
    $request_args = [
      'headers' => [
        'Ocp-Apim-Subscription-Key' => $this->settings['api_key'] ?? '',
      ],
    ];

    if ( function_exists( 'vip_safe_wp_remote_get' ) ) {
      // Request arguments are the sixth parameter; the preceding values
      // (fallback value, threshold, timeout, retry) are the function's defaults.
      $response = vip_safe_wp_remote_get( $operation_url, '', 3, 1, 20, $request_args );
    } else {
      // phpcs:ignore WordPressVIPMinimum.Functions.RestrictedFunctions.wp_remote_get_wp_remote_get -- use of `vip_safe_wp_remote_get` is done when available.
      $response = wp_remote_get( $operation_url, $request_args );
    }

    // Keep the latest raw response around for troubleshooting.
    set_transient( 'classifai_azure_computer_vision_pdf_text_extraction_check_result_latest_response', $response, DAY_IN_SECONDS * 30 );

    if ( is_wp_error( $response ) ) {
      return $this->log_error( $response );
    }

    $response_code = wp_remote_retrieve_response_code( $response );

    // Anything other than 200 carries no usable Read status; report it instead of silently stopping.
    if ( 200 !== $response_code ) {
      return $this->log_error(
        new WP_Error(
          'invalid_read_response',
          sprintf(
            // translators: %s is the HTTP status code returned when checking the Read result.
            esc_html__( 'Unexpected response code (%s) when checking the Read result.', 'classifai' ),
            $response_code
          )
        )
      );
    }

    $body = json_decode( wp_remote_retrieve_body( $response ), true );

    if ( empty( $body['status'] ) ) {
      return $this->log_error( new WP_Error( 'invalid_read_result', esc_html__( 'Invalid Read result.', 'classifai' ) ) );
    }

    switch ( $body['status'] ) {
      case 'notStarted':
      case 'running':
        $this->update_status( $body );

        /**
         * Filters the Read retry interval.
         *
         * @since 1.7.0
         * @hook classifai_azure_read_retry_interval
         *
         * @param {int} $seconds How many seconds should the interval be? Default 60.
         *
         * @return {int} Filtered interval.
         */
        $retry_interval = (int) apply_filters( 'classifai_azure_read_retry_interval', MINUTE_IN_SECONDS );

        wp_schedule_single_event( time() + $retry_interval, 'classifai_retry_get_read_result', [ $operation_url, $this->attachment_id ] );

        // Still in progress: nothing to return until the scheduled check runs.
        return null;

      case 'failed':
        return $this->log_error( new WP_Error( 'failed_read_request', esc_html__( 'The Read operation has failed.', 'classifai' ) ) );

      case 'succeeded':
        return $this->update_document_description( $body );

      default:
        return $this->log_error( new WP_Error( 'invalid_read_result_status', esc_html__( 'Invalid Read result status.', 'classifai' ) ) );
    }
  }
  /**
   * Update document description using text received from Read API.
   *
   * Collects the `text` of every line on the first pages of the result (the
   * page limit is filterable), joins the lines with single spaces and passes
   * the text to the feature's `save()` method. On success the raw result is
   * stored through `update_status()`.
   *
   * @param array $data Read result.
   * @return WP_Error|null WP_Error when the result is invalid or saving fails, null otherwise.
   */
  public function update_document_description( array $data ) {
    if (
      empty( $data['analyzeResult'] )
      || empty( $data['analyzeResult']['readResults'] )
      || ! is_array( $data['analyzeResult']['readResults'] )
    ) {
      return $this->log_error( new WP_Error( 'invalid_read_result', esc_html__( 'The Read result is invalid.', 'classifai' ) ) );
    }

    $read_results = $data['analyzeResult']['readResults'];

    /**
     * Filter the max pages that can be processed.
     *
     * @since 1.7.0
     * @hook classifai_azure_read_result_max_page
     *
     * @param {int} $max_page The maximum pages that are read. Default 2.
     *
     * @return {int} Filtered max pages.
     */
    $max_page = (int) apply_filters( 'classifai_azure_read_result_max_page', 2 );

    // Never read more pages than were returned, and never a negative amount.
    $max_page = min( max( 0, $max_page ), count( $read_results ) );

    $lines_of_text = [];

    foreach ( array_slice( $read_results, 0, $max_page ) as $page_result ) {
      // A page without recognised text may have no usable `lines` entry.
      $lines = $page_result['lines'] ?? [];

      if ( ! is_array( $lines ) ) {
        continue;
      }

      foreach ( $lines as $line ) {
        if ( is_array( $line ) && isset( $line['text'] ) ) {
          $lines_of_text[] = $line['text'];
        }
      }
    }

    /**
     * Filter the text result returned from Read API.
     *
     * @since 1.7.0
     * @hook classifai_azure_read_text_result
     *
     * @param {array} $lines_of_text Array of text extracted from the response.
     * @param {int} $attachment_id The attachment ID.
     * @param {array} $data Read result.
     *
     * @return {array} Filtered array of text.
     */
    $lines_of_text = apply_filters( 'classifai_azure_read_text_result', $lines_of_text, $this->attachment_id, $data );

    $update = $this->feature->save( implode( ' ', (array) $lines_of_text ), $this->attachment_id );

    if ( is_wp_error( $update ) ) {
      return $this->log_error( $update );
    }

    $this->update_status( $data );

    return null;
  }
  /**
   * Record an error message on the attachment for troubleshooting.
   *
   * The message is written to the `_classifai_azure_read_error` post meta of
   * the current attachment; the error object itself is handed back unchanged so
   * callers can `return $this->log_error( ... )` in a single statement.
   *
   * @param WP_Error $error WP_Error object.
   * @return WP_Error The same error object that was passed in.
   */
  private function log_error( WP_Error $error ) {
    $message = $error->get_error_message();

    update_post_meta( $this->attachment_id, '_classifai_azure_read_error', $message );

    return $error;
  }
  /**
   * Persist the latest Read result on the attachment.
   *
   * The data is stored as-is in the `_classifai_azure_read_status` post meta
   * of the current attachment.
   *
   * @see https://centraluseuap.dev.cognitive.microsoft.com/docs/services/computer-vision-v3-2/operations/5d9869604be85dee480c8750
   *
   * @param array $data Response body of the read result.
   * @return array The same data that was passed in.
   */
  private function update_status( array $data ): array {
    $meta_key = '_classifai_azure_read_status';

    update_post_meta( $this->attachment_id, $meta_key, $data );

    return $data;
  }
  307. }