<?php
/**
* Scan PDF files to extract visible text with the AI Vision Read service.
*
* @since 1.6.1
* @package Classifai
*/
namespace Classifai\Providers\Azure;
use WP_Error;
use function Classifai\computer_vision_max_filesize;
/**
* Read class
*
* Connects to AI Vision's Read endpoint to detect text.
*
* @see https://docs.microsoft.com/en-us/rest/api/cognitiveservices/computervision/recognizeprintedtext/
*/
class Read {

	/**
	 * The AI Vision API path to the Read service.
	 *
	 * @var string
	 */
	const API_PATH = 'vision/v3.2/read/';

	/**
	 * ComputerVision settings.
	 *
	 * Keys read by this class: `endpoint_url`, `api_key` and, when present, `authenticated`.
	 *
	 * @var array
	 */
	private $settings;

	/**
	 * Attachment ID to process.
	 *
	 * @var int
	 */
	private $attachment_id;

	/**
	 * Feature instance. Its `save()` method is used to persist the extracted text.
	 *
	 * @var \Classifai\Features\PDFTextExtraction
	 */
	private $feature;

	/**
	 * Constructor
	 *
	 * @param array                                 $settings      Computer Vision settings.
	 * @param int                                   $attachment_id Attachment ID to process.
	 * @param \Classifai\Features\PDFTextExtraction $feature       Feature instance.
	 */
	public function __construct( array $settings, int $attachment_id, $feature ) {
		$this->settings      = $settings;
		$this->attachment_id = $attachment_id;
		$this->feature       = $feature;
	}

	/**
	 * Builds the API url.
	 *
	 * The result is the configured endpoint URL (with a trailing slash),
	 * followed by the Read API path and the given `$path`.
	 *
	 * @param string $path Path to append to API URL.
	 * @return string
	 */
	public function get_api_url( string $path = '' ): string {
		return sprintf( '%s%s%s', trailingslashit( $this->settings['endpoint_url'] ?? '' ), static::API_PATH, $path );
	}

	/**
	 * Check if Read processing should be applied to the attachment.
	 *
	 * The attachment qualifies by default when its MIME type maps to an
	 * approved file extension (currently only `pdf`). The final decision is
	 * filterable.
	 *
	 * @return bool
	 */
	public function should_process(): bool {
		$mime_type            = get_post_mime_type( $this->attachment_id );
		$extension_pattern    = array_search( $mime_type, wp_get_mime_types(), true );
		$approved_media_types = [ 'pdf' ];

		// `array_search()` returns false when the MIME type is unknown (or the attachment has none).
		// Keys of `wp_get_mime_types()` are pipe-separated extension lists, e.g. `jpg|jpeg|jpe`.
		$matched_extensions = is_string( $extension_pattern ) ? explode( '|', $extension_pattern ) : [];
		$process            = ! empty( array_intersect( $matched_extensions, $approved_media_types ) );

		/**
		 * Filters whether to run Read processing on this attachment item
		 *
		 * @since 1.7.0
		 * @hook classifai_azure_read_should_process
		 *
		 * @param {bool} $process       Whether to run OCR processing or not.
		 * @param {int}  $attachment_id The attachment ID.
		 *
		 * @return {bool} Whether this attachment should have OCR processing.
		 */
		return (bool) apply_filters( 'classifai_azure_read_should_process', $process, $this->attachment_id );
	}

	/**
	 * Call the Azure Read API.
	 *
	 * Validates authentication, file type and file size, then submits the
	 * attachment URL to the Read `analyze` endpoint. When the service accepts
	 * the request (HTTP 202), the operation URL from the `Operation-Location`
	 * header is polled once via `check_read_result()`. Every failure is
	 * recorded through `log_error()` before being returned.
	 *
	 * @return array|null|WP_Error Result of `check_read_result()` when the request was accepted, WP_Error otherwise.
	 */
	public function read_document() {
		// Check if valid authentication is in place.
		if ( empty( $this->settings ) || ( isset( $this->settings['authenticated'] ) && false === $this->settings['authenticated'] ) ) {
			return $this->log_error( new WP_Error( 'auth', esc_html__( 'Please set up valid authentication with Azure.', 'classifai' ) ) );
		}

		if ( ! $this->should_process() ) {
			return $this->log_error( new WP_Error( 'process_error', esc_html__( 'Document does not meet processing requirements.', 'classifai' ) ) );
		}

		// `get_attached_file()` returns false when no file is attached; avoid calling `filesize()` on a missing path.
		$file_path    = get_attached_file( $this->attachment_id );
		$filesize     = ( $file_path && is_file( $file_path ) ) ? filesize( $file_path ) : false;
		$max_filesize = computer_vision_max_filesize();

		if ( ! $filesize || $filesize > $max_filesize ) {
			return $this->log_error(
				new WP_Error(
					'size_error',
					esc_html(
						sprintf(
							// translators: %1$s is the document file size in bytes, %2$s is the current default max filesize in bytes, %3$s is the integer '4 * MB_IN_BYTES'
							__( 'Document (%1$s bytes) does not meet size requirements. Please ensure it is smaller than the maximum threshold (currently %2$s bytes, defaults to %3$s bytes).', 'classifai' ),
							! $filesize ? __( 'size not found', 'classifai' ) : $filesize,
							$max_filesize,
							4 * MB_IN_BYTES
						)
					),
					$filesize
				)
			);
		}

		/**
		 * Filters the request arguments sent to Read endpoint.
		 *
		 * @since 1.7.0
		 * @hook classifai_azure_read_request_args
		 *
		 * @param {array} $args          Query arguments appended to the request URL. Default empty array.
		 * @param {int}   $attachment_id The attachment ID.
		 *
		 * @return {array} Filtered request arguments.
		 */
		$request_args = apply_filters( 'classifai_azure_read_request_args', [], $this->attachment_id );

		$url = add_query_arg(
			$request_args,
			$this->get_api_url( 'analyze' )
		);

		$document_url = wp_get_attachment_url( $this->attachment_id );

		if ( ! $document_url ) {
			return $this->log_error( new WP_Error( 'invalid_attachment', esc_html__( 'Document does not exist.', 'classifai' ) ) );
		}

		$response = wp_remote_post(
			$url,
			[
				'body'    => wp_json_encode(
					[
						'url' => $document_url,
					]
				),
				'headers' => [
					'Content-Type'              => 'application/json',
					'Ocp-Apim-Subscription-Key' => $this->settings['api_key'],
				],
			]
		);

		/**
		 * Fires after the request to the read endpoint has run.
		 *
		 * @since 1.5.0
		 * @hook classifai_azure_read_after_request
		 *
		 * @param {array|WP_Error} Response data or a WP_Error if the request failed.
		 * @param {string} The request URL with query args added.
		 * @param {int} The document ID.
		 * @param {string} The document URL.
		 */
		do_action( 'classifai_azure_read_after_request', $response, $url, $this->attachment_id, $document_url );

		if ( is_wp_error( $response ) ) {
			return $this->log_error( $response );
		}

		// 202 means the request was accepted; the result is fetched from the operation URL.
		if ( 202 === wp_remote_retrieve_response_code( $response ) ) {
			$operation_url = wp_remote_retrieve_header( $response, 'Operation-Location' );

			if ( ! filter_var( $operation_url, FILTER_VALIDATE_URL ) ) {
				return $this->log_error( new WP_Error( 'invalid_read_operation_url', esc_html__( 'Operation URL is invalid.', 'classifai' ) ) );
			}

			return $this->check_read_result( $operation_url );
		}

		// Any other status is treated as an error; surface the service's own error details when present.
		$body = json_decode( wp_remote_retrieve_body( $response ), true );

		if ( empty( $body['error'] ) || empty( $body['error']['code'] ) || empty( $body['error']['message'] ) ) {
			return $this->log_error( new WP_Error( 'unknown_read_error', esc_html__( 'Unknown Read error.', 'classifai' ) ) );
		}

		return $this->log_error( new WP_Error( $body['error']['code'], $body['error']['message'] ) );
	}

	/**
	 * Check the status of a read operation.
	 *
	 * Requests the operation URL once. While the operation is still pending,
	 * the status is stored and a single `classifai_retry_get_read_result`
	 * cron event is scheduled to check again later. On success the extracted
	 * text is saved via `update_document_description()`.
	 *
	 * @param string $operation_url Operation URL for checking the read status.
	 * @return WP_Error|null|array WP_Error on failure, null while the operation is pending, the read result on success.
	 */
	public function check_read_result( string $operation_url ) {
		// The subscription key is required on the status request as well, whichever HTTP helper is used.
		$request_args = [
			'headers' => [
				'Ocp-Apim-Subscription-Key' => $this->settings['api_key'],
			],
		];

		if ( function_exists( 'vip_safe_wp_remote_get' ) ) {
			// Request arguments are the helper's sixth parameter; the values before it are its defaults.
			$response = vip_safe_wp_remote_get( $operation_url, '', 3, 1, 20, $request_args );
		} else {
			// phpcs:ignore WordPressVIPMinimum.Functions.RestrictedFunctions.wp_remote_get_wp_remote_get -- use of `vip_safe_wp_remote_get` is done when available.
			$response = wp_remote_get( $operation_url, $request_args );
		}

		// Keep the latest raw response around for troubleshooting.
		set_transient( 'classifai_azure_computer_vision_pdf_text_extraction_check_result_latest_response', $response, DAY_IN_SECONDS * 30 );

		if ( is_wp_error( $response ) ) {
			return $this->log_error( $response );
		}

		$response_code = wp_remote_retrieve_response_code( $response );

		if ( 200 !== $response_code ) {
			return $this->log_error(
				new WP_Error(
					'unexpected_read_result_response',
					esc_html(
						sprintf(
							// translators: %s is the HTTP status code returned when checking the Read result.
							__( 'Unexpected response code (%s) while checking the Read result.', 'classifai' ),
							$response_code
						)
					)
				)
			);
		}

		$body = json_decode( wp_remote_retrieve_body( $response ), true );

		if ( empty( $body['status'] ) ) {
			return $this->log_error( new WP_Error( 'invalid_read_result', esc_html__( 'Invalid Read result.', 'classifai' ) ) );
		}

		switch ( $body['status'] ) {
			case 'notStarted':
			case 'running':
				$this->update_status( $body );

				/**
				 * Filters the Read retry interval.
				 *
				 * @since 1.7.0
				 * @hook classifai_azure_read_retry_interval
				 *
				 * @param {int} $seconds How many seconds should the interval be? Default 60.
				 *
				 * @return {int} Filtered interval.
				 */
				$retry_interval = apply_filters( 'classifai_azure_read_retry_interval', MINUTE_IN_SECONDS );

				wp_schedule_single_event( time() + $retry_interval, 'classifai_retry_get_read_result', [ $operation_url, $this->attachment_id ] );

				// Still pending: nothing to return until the scheduled retry runs.
				return null;

			case 'failed':
				return $this->log_error( new WP_Error( 'failed_read_request', esc_html__( 'The Read operation has failed.', 'classifai' ) ) );

			case 'succeeded':
				return $this->update_document_description( $body );

			default:
				return $this->log_error( new WP_Error( 'invalid_read_result_status', esc_html__( 'Invalid Read result status.', 'classifai' ) ) );
		}
	}

	/**
	 * Update document description using text received from Read API.
	 *
	 * Collects the text lines of the first pages of the result (two by
	 * default, filterable), joins them with spaces and hands the text to the
	 * feature's `save()` method. On success the read result is stored as the
	 * attachment's read status.
	 *
	 * @param array $data Read result.
	 * @return WP_Error|array WP_Error on failure, the read result on success.
	 */
	public function update_document_description( array $data ) {
		$read_results = $data['analyzeResult']['readResults'] ?? null;

		if ( empty( $read_results ) || ! is_array( $read_results ) ) {
			return $this->log_error( new WP_Error( 'invalid_read_result', esc_html__( 'The Read result is invalid.', 'classifai' ) ) );
		}

		/**
		 * Filter the max pages that can be processed.
		 *
		 * @since 1.7.0
		 * @hook classifai_azure_read_result_max_page
		 *
		 * @param {int} $max_page The maximum pages that are read.
		 *
		 * @return {int} Filtered max pages.
		 */
		$max_page = min( (int) apply_filters( 'classifai_azure_read_result_max_page', 2 ), count( $read_results ) );

		$lines_of_text = [];

		for ( $page = 0; $page < $max_page; $page++ ) {
			// A page may come back without any recognised lines.
			$lines = $read_results[ $page ]['lines'] ?? [];

			if ( ! is_array( $lines ) ) {
				continue;
			}

			foreach ( $lines as $line ) {
				if ( is_array( $line ) && isset( $line['text'] ) ) {
					$lines_of_text[] = $line['text'];
				}
			}
		}

		/**
		 * Filter the text result returned from Read API.
		 *
		 * @since 1.7.0
		 * @hook classifai_azure_read_text_result
		 *
		 * @param {array} $lines_of_text Array of text extracted from the response.
		 * @param {int}   $attachment_id The attachment ID.
		 * @param {array} $data          Read result.
		 *
		 * @return {array} Filtered array of text.
		 */
		$lines_of_text = apply_filters( 'classifai_azure_read_text_result', $lines_of_text, $this->attachment_id, $data );

		$update = $this->feature->save( implode( ' ', (array) $lines_of_text ), $this->attachment_id );

		if ( is_wp_error( $update ) ) {
			return $this->log_error( $update );
		}

		return $this->update_status( $data );
	}

	/**
	 * Log error to metadata for troubleshooting.
	 *
	 * Stores the error message in the `_classifai_azure_read_error` post meta
	 * of the attachment and hands the error back so callers can return it.
	 *
	 * @param WP_Error $error WP_Error object.
	 * @return WP_Error The same error object that was passed in.
	 */
	private function log_error( WP_Error $error ) {
		update_post_meta( $this->attachment_id, '_classifai_azure_read_error', $error->get_error_message() );

		return $error;
	}

	/**
	 * Log the status of read process to database.
	 *
	 * Stores the given data in the `_classifai_azure_read_status` post meta of
	 * the attachment.
	 *
	 * @see https://centraluseuap.dev.cognitive.microsoft.com/docs/services/computer-vision-v3-2/operations/5d9869604be85dee480c8750
	 *
	 * @param array $data Response body of the read result.
	 * @return array The data that was stored.
	 */
	private function update_status( array $data ): array {
		update_post_meta( $this->attachment_id, '_classifai_azure_read_status', $data );

		return $data;
	}
}