<?php
namespace Classifai;
/**
* Normalize takes the post_content within a post and cleans it up for
* sending to various APIs. Shortcodes, abbreviations, HTML tags
* are all stripped out here.
*
* A 'classifai_normalize' filter is provided to extend this to add
* metadata or to perform additional cleanup.
*/
class Normalizer {

	/**
	 * Creates a plain text normalized version of the post's content.
	 *
	 * The post title is also included in the content to improve
	 * accuracy.
	 *
	 * @param int    $post_id      The post to normalize
	 * @param string $post_content The post content to normalize. When empty, the
	 *                             post's own content is run through the
	 *                             'the_content' filter and used instead.
	 * @return string
	 */
	public function normalize( $post_id, $post_content = '' ) {
		$post       = get_post( $post_id );
		$post_title = '';

		/*
		 * get_post() returns null for an unknown ID. Only read post
		 * properties when a post was actually found; otherwise normalize
		 * whatever content the caller supplied, without a title.
		 */
		if ( ! empty( $post ) ) {
			if ( empty( $post_content ) ) {
				$post_content = apply_filters( 'the_content', $post->post_content );
			}

			/*
			 * Pass the post ID as the second argument, as WordPress core
			 * does, so callbacks registered with two accepted arguments
			 * receive what they expect.
			 */
			$post_title = apply_filters( 'the_title', $post->post_title, $post->ID );
		}

		/*
		 * Strip enclosing shortcodes but keep their inner text (e.g. captions).
		 *
		 * The closing tag must match the opening tag (\1) and the inner
		 * match is lazy, so several shortcodes in the same content are each
		 * unwrapped on their own instead of being swallowed by one greedy
		 * match. The "s" modifier lets the inner text span multiple lines.
		 */
		$post_content = $this->replace_pattern(
			'#\[([^\s\[\]/]+)[^\]]*\](.+?)\[/\1\]#s',
			'$2',
			$post_content
		);

		return $this->normalize_content( $post_content, $post_title, $post_id );
	}

	/**
	 * Normalizes post_content into plain text.
	 *
	 * @param string $post_content The post content.
	 * @param string $post_title   The post title. Optional: prepended to the content to improve accuracy.
	 * @param int    $post_id      The post id. Optional.
	 * @return string The normalized plain text content.
	 */
	public function normalize_content( $post_content, $post_title = '', $post_id = false ) {
		/**
		 * Hook to filter post content before stripping HTML tags.
		 *
		 * @since 3.1.0
		 * @hook classifai_pre_normalize
		 *
		 * @param {string} $post_content The post content.
		 *
		 * @return {string} The filtered Post content.
		 */
		$post_content = apply_filters( 'classifai_pre_normalize', $post_content );

		/* Strip HTML entities */
		$post_content = $this->replace_pattern( '/&#?[a-z0-9]{2,8};/i', '', $post_content );

		/* Replace HTML linebreaks with newlines (any letter case, any amount of whitespace before the slash) */
		$post_content = $this->replace_pattern( '#<br\s*/?>#i', "\n\n", $post_content );

		/* Strip all HTML tags */
		$post_content = wp_strip_all_tags( $post_content );

		if ( ! empty( $post_title ) ) {
			/* Include title to improve relevancy */
			$post_content = $post_title . ".\n\n" . $post_content;
		}

		/**
		 * Filters the normalized content to allow for additional cleanup.
		 *
		 * @since 0.1.0
		 * @hook classifai_normalize
		 *
		 * @param {string} $post_content The normalized post content.
		 * @param {int}    $post_id      The ID of the post whose content is being normalized.
		 *
		 * @return {string} The filtered normalized post content.
		 */
		return apply_filters( 'classifai_normalize', trim( $post_content ), $post_id );
	}

	/**
	 * Runs preg_replace() and falls back to the untouched subject on failure.
	 *
	 * preg_replace() returns null when the regex engine errors out (for
	 * example when the backtrack limit is hit on very large content). Without
	 * this guard such an error would silently discard the whole content.
	 *
	 * @param string $pattern     The regular expression.
	 * @param string $replacement The replacement string.
	 * @param string $subject     The text to operate on.
	 * @return string The replaced text, or $subject unchanged if the replacement failed.
	 */
	private function replace_pattern( $pattern, $replacement, $subject ) {
		$result = preg_replace( $pattern, $replacement, $subject );

		return null === $result ? $subject : $result;
	}
}