<?php
/**
* Provides OCR detection with the AI Vision service.
*
* @since 1.6.0
* @package Classifai
*/
namespace Classifai\Providers\Azure;
use WP_Error;
use function Classifai\computer_vision_max_filesize;
use function Classifai\get_largest_size_and_dimensions_image_url;
/**
* OCR class
*
* Connects to AI Vision's ocr endpoint to detect text.
*
* @see https://docs.microsoft.com/en-us/rest/api/cognitiveservices/computervision/recognizeprintedtext/
* @since 1.6.0
*/
class OCR {

	/**
	 * The AI Vision API path to the OCR service.
	 *
	 * @since 1.6.0
	 *
	 * @var string
	 */
	const API_PATH = 'vision/v3.2/ocr/';

	/**
	 * ComputerVision settings.
	 *
	 * This class reads the `endpoint_url`, `api_key` and (optionally)
	 * `authenticated` keys.
	 *
	 * @since 1.6.0
	 *
	 * @var array
	 */
	private $settings;

	/**
	 * Image scan results.
	 *
	 * When this is an object with a non-empty `tags` array, those tags are
	 * used by should_process() to decide whether OCR is worth running.
	 *
	 * @since 1.6.0
	 *
	 * @var bool|object
	 */
	private $scan;

	/**
	 * Media types to process.
	 *
	 * These are file extensions, compared against the extensions that
	 * wp_get_mime_types() maps to the attachment's mime type.
	 *
	 * @since 1.6.0
	 *
	 * @var array
	 */
	private $media_to_process = [
		'bmp',
		'gif',
		'jpeg',
		'png',
	];

	/**
	 * OCR constructor.
	 *
	 * @since 1.6.0
	 *
	 * @param array       $settings Computer Vision settings.
	 * @param bool|object $scan     Previously run image scan.
	 */
	public function __construct( array $settings, $scan ) {
		$this->settings = $settings;
		$this->scan     = $scan;
	}

	/**
	 * Builds the API url.
	 *
	 * A missing `endpoint_url` setting is treated as an empty string so that
	 * calling this method never triggers an undefined-index warning.
	 *
	 * @since 1.6.0
	 *
	 * @return string
	 */
	public function get_api_url(): string {
		$endpoint = (string) ( $this->settings['endpoint_url'] ?? '' );

		return sprintf( '%s%s', trailingslashit( $endpoint ), static::API_PATH );
	}

	/**
	 * Returns whether OCR processing should be applied to the attachment
	 *
	 * The attachment must have an approved file extension. If a previous
	 * image scan with tags was supplied, at least one of the wanted tags must
	 * additionally be present with a high enough confidence. Without a
	 * previous scan, the extension check alone decides.
	 *
	 * @since 1.6.0
	 *
	 * @param int $attachment_id Attachment ID.
	 * @return bool
	 */
	public function should_process( int $attachment_id ): bool {
		$mime_type = get_post_mime_type( $attachment_id );

		// array_search() yields false for an unknown mime type; cast so explode() always gets a string.
		$matched_extensions = explode( '|', (string) array_search( $mime_type, wp_get_mime_types(), true ) );
		$process            = false;

		/**
		 * Filters the media types that should be processed
		 *
		 * @since 1.6.0
		 * @hook classifai_ocr_approved_media_types
		 *
		 * @param {array} $media_types   The media types to process.
		 * @param {int}   $attachment_id The attachment ID.
		 *
		 * @return {array} Filtered media types.
		 */
		$approved_media_types = (array) apply_filters( 'classifai_ocr_approved_media_types', $this->media_to_process, $attachment_id );

		foreach ( $matched_extensions as $ext ) {
			if ( in_array( $ext, $approved_media_types, true ) ) {
				$process = true;
				break;
			}
		}

		// If we have a proper image and a previous image scan, check
		// to see if we have proper tags set, with a high confidence
		if ( $process && is_object( $this->scan ) && ! empty( $this->scan->tags ) && is_array( $this->scan->tags ) ) {
			/**
			 * Filters the tags we check for OCR processing
			 *
			 * @since 1.6.0
			 * @hook classifai_ocr_tags
			 *
			 * @param {array}       $tags          Tags to look for. Default handwriting and text.
			 * @param {int}         $attachment_id The attachment ID.
			 * @param {bool|object} $scan          Previously run scan.
			 *
			 * @return {array} Filtered tags.
			 */
			$tags = (array) apply_filters( 'classifai_ocr_tags', [ 'handwriting', 'text' ], $attachment_id, $this->scan );

			/**
			 * Filters the tag confidence level for OCR processing
			 *
			 * @since 1.6.0
			 * @hook classifai_ocr_tag_confidence
			 *
			 * @param {int}         $confidence    The minimum confidence level. Default 90.
			 * @param {int}         $attachment_id The attachment ID.
			 * @param {bool|object} $scan          Previously run scan.
			 *
			 * @return {int} Confidence level.
			 */
			$tag_confidence = apply_filters( 'classifai_ocr_tag_confidence', 90, $attachment_id, $this->scan );

			// A scan is available, so the decision now rests on the tags:
			// start from "no" and only opt back in on a confident match.
			// Without this reset the loop below could never change the result.
			$process = false;

			foreach ( $this->scan->tags as $tag ) {
				if ( ! isset( $tag->name, $tag->confidence ) ) {
					continue;
				}

				// Confidence is multiplied by 100 to compare against the percentage threshold.
				if ( in_array( $tag->name, $tags, true ) && $tag->confidence * 100 >= $tag_confidence ) {
					$process = true;
					break;
				}
			}
		}

		/**
		 * Filters whether to run OCR processing on this media item
		 *
		 * @since 1.6.0
		 * @hook classifai_ocr_should_process
		 *
		 * @param {bool}        $process       Whether to run OCR processing or not.
		 * @param {int}         $attachment_id The attachment ID.
		 * @param {bool|object} $scan          Previously run scan.
		 *
		 * @return {bool} Whether this attachment should have OCR processing.
		 */
		return (bool) apply_filters( 'classifai_ocr_should_process', $process, $attachment_id, $this->scan );
	}

	/**
	 * Get the OCR data
	 *
	 * Runs the OCR request for the attachment, stores the raw result in a
	 * transient, and, when text was detected, saves the full scan as post
	 * meta and returns the detected words joined by single spaces.
	 *
	 * @since 1.6.0
	 *
	 * @param array   $metadata      Attachment metadata.
	 * @param integer $attachment_id Attachment ID.
	 * @return string|WP_Error Detected text (empty string when none was found) or an error.
	 */
	public function generate_ocr_data( array $metadata, int $attachment_id ) {
		if ( ! $this->should_process( $attachment_id ) ) {
			return new WP_Error( 'process_error', esc_html__( 'Image does not meet processing requirements.', 'classifai' ), $metadata );
		}

		$url = get_largest_size_and_dimensions_image_url(
			get_attached_file( $attachment_id ),
			wp_get_attachment_url( $attachment_id ),
			$metadata,
			[ 50, 4200 ],
			[ 50, 4200 ],
			computer_vision_max_filesize()
		);

		// If a properly sized image isn't found, return
		if ( ! $url ) {
			return new WP_Error( 'size_error', esc_html__( 'Image does not meet size requirements. Please ensure it is at least 50x50 but less than 4200x4200 and smaller than 4MB.', 'classifai' ), $metadata );
		}

		$scan = $this->process( $url );

		// Keep the latest response (successful or not) around for 30 days.
		set_transient( 'classifai_azure_computer_vision_ocr_latest_response', $scan, DAY_IN_SECONDS * 30 );

		if ( is_wp_error( $scan ) ) {
			return $scan;
		}

		$text = $this->collect_words( $scan );

		// Nothing detected: honour the documented string return type
		// instead of handing the raw scan object back to the caller.
		if ( empty( $text ) ) {
			return '';
		}

		/**
		 * Filter the text returned from the API.
		 *
		 * @since 1.6.0
		 * @hook classifai_ocr_text
		 *
		 * @param {string} $text The returned text data.
		 * @param {object} $scan The full scan results from the API.
		 *
		 * @return {string} The filtered text data.
		 */
		$rtn = apply_filters( 'classifai_ocr_text', implode( ' ', $text ), $scan );

		// Save all the results for later
		update_post_meta( $attachment_id, 'classifai_computer_vision_ocr', $scan );

		return $rtn;
	}

	/**
	 * Collects every word found in a scan result.
	 *
	 * Walks the regions -> lines -> words chain, skipping any level that
	 * is missing rather than raising warnings.
	 *
	 * @param object $scan Decoded OCR response.
	 * @return array List of word texts in the order they were encountered.
	 */
	private function collect_words( $scan ): array {
		$words = [];

		if ( ! isset( $scan->regions ) || ! is_array( $scan->regions ) ) {
			return $words;
		}

		// Iterate down the chain to find the text we want
		foreach ( $scan->regions as $region ) {
			foreach ( ( $region->lines ?? [] ) as $line ) {
				foreach ( ( $line->words ?? [] ) as $word ) {
					if ( isset( $word->text ) ) {
						$words[] = $word->text;
					}
				}
			}
		}

		return $words;
	}

	/**
	 * Run OCR processing using the Azure API
	 *
	 * Any response that is not a 200 with a JSON object body is reported as
	 * a WP_Error, so callers never receive null.
	 *
	 * @since 1.6.0
	 *
	 * @param string $url Media URL.
	 * @return object|WP_Error
	 */
	public function process( string $url ) {
		// Check if valid authentication is in place.
		$not_authenticated = isset( $this->settings['authenticated'] ) && false === $this->settings['authenticated'];

		if (
			empty( $this->settings )
			|| $not_authenticated
			|| empty( $this->settings['endpoint_url'] )
			|| empty( $this->settings['api_key'] )
		) {
			return new WP_Error( 'auth', esc_html__( 'Please set up valid authentication with Azure.', 'classifai' ) );
		}

		$response = wp_remote_post(
			$this->get_api_url(),
			[
				'body'    => wp_json_encode(
					[
						'url' => $url,
					]
				),
				'headers' => [
					'Content-Type'              => 'application/json',
					'Ocp-Apim-Subscription-Key' => $this->settings['api_key'],
				],
			]
		);

		/**
		 * Fires after the request to the ocr endpoint has run.
		 *
		 * @since 1.6.0
		 * @hook classifai_ocr_after_request
		 *
		 * @param {array|WP_Error} $response Response data or a WP_Error if the request failed.
		 * @param {string}         $url      The attachment URL.
		 */
		do_action( 'classifai_ocr_after_request', $response, $url );

		if ( is_wp_error( $response ) ) {
			return $response;
		}

		$body        = json_decode( wp_remote_retrieve_body( $response ) );
		$status_code = wp_remote_retrieve_response_code( $response );

		if ( 200 !== $status_code ) {
			/**
			 * Fires when the ocr API response did not succeed.
			 *
			 * @since 1.6.0
			 * @hook classifai_ocr_unsuccessful_response
			 *
			 * @param {array}  $response Response data.
			 * @param {string} $url      The attachment URL.
			 */
			do_action( 'classifai_ocr_unsuccessful_response', $response, $url );

			// The message may sit at the top level or under an `error` object.
			$error_message = $body->message ?? $body->error->message ?? '';

			if ( ! is_string( $error_message ) || '' === $error_message ) {
				// The body had no usable message (e.g. an HTML error page).
				$error_message = sprintf(
					/* translators: %d: HTTP status code. */
					esc_html__( 'The OCR request failed with HTTP status %d.', 'classifai' ),
					(int) $status_code
				);
			}

			return new WP_Error( $body->code ?? $body->error->code ?? 'error', $error_message, $body );
		}

		if ( ! is_object( $body ) ) {
			return new WP_Error( 'invalid_response', esc_html__( 'The OCR response could not be parsed.', 'classifai' ) );
		}

		return $body;
	}
}