Source: Normalizer.php

  1. <?php
  2. namespace Classifai;
  3. /**
  4. * Normalize takes the post_content within a post and cleans it up for
  5. * sending to various APIs. Shortcodes, abbreviations, HTML tags
  6. * are all stripped out here.
  7. *
  8. * A 'classifai_normalize' filter is provided to extend this to add
  9. * metadata or to perform additional cleanup.
  10. */
  class Normalizer {

      /**
       * Creates a plain text normalized version of the post's content.
       *
       * The post title is also included in the content to improve
       * accuracy.
       *
       * @param int    $post_id      The post to normalize.
       * @param string $post_content Optional. Content to normalize instead of the
       *                             post's own content. When empty, the post's
       *                             post_content is run through 'the_content'.
       * @return string The normalized plain text.
       */
      public function normalize( $post_id, $post_content = '' ) {
          $post = get_post( $post_id );

          /*
           * The lookup yields no object when the post cannot be found. Fall back
           * to empty values instead of reading properties off a non-object.
           */
          $has_post    = is_object( $post );
          $raw_content = $has_post ? $post->post_content : '';
          $raw_title   = $has_post ? $post->post_title : '';
          $title_id    = $has_post ? $post->ID : $post_id;

          if ( empty( $post_content ) ) {
              $post_content = apply_filters( 'the_content', $raw_content );
          }

          /*
           * 'the_title' callbacks are written against a ( $title, $post_id )
           * signature, so the post id is passed along with the title.
           */
          $post_title = apply_filters( 'the_title', $raw_title, $title_id );

          /*
           * Strip shortcodes but keep internal caption text. The tag matches are
           * bounded by the closing bracket and the inner match is lazy, so several
           * shortcodes on one line are unwrapped one by one instead of being
           * swallowed as a single match (which dropped the text between them).
           */
          $post_content = $this->replace_or_keep( '#\[[^\]]+\](.+?)\[/[^\]]+\]#', '$1', $post_content );

          return $this->normalize_content( $post_content, $post_title, $post_id );
      }

      /**
       * Normalizes post_content into plain text.
       *
       * @param string $post_content The post content.
       * @param string $post_title   The post title. Optional: prepended to the content to improve accuracy.
       * @param int    $post_id      The post id. Optional.
       * @return string The normalized plain text.
       */
      public function normalize_content( $post_content, $post_title = '', $post_id = false ) {
          /**
           * Hook to filter post content before stripping HTML tags.
           *
           * @since 3.1.0
           * @hook classifai_pre_normalize
           *
           * @param {string} $post_content The post content.
           *
           * @return {string} The filtered Post content.
           */
          $post_content = apply_filters( 'classifai_pre_normalize', $post_content );

          /* Strip HTML entities */
          $post_content = $this->replace_or_keep( '/&#?[a-z0-9]{2,8};/i', '', $post_content );

          /* Replace HTML linebreaks with newlines (any letter case, any spacing before the slash) */
          $post_content = $this->replace_or_keep( '#<br\s*/?>#i', "\n\n", $post_content );

          /* Strip all HTML tags */
          $post_content = wp_strip_all_tags( $post_content );

          if ( ! empty( $post_title ) ) {
              /* Include title to improve relevancy */
              $post_content = $post_title . ".\n\n" . $post_content;
          }

          /**
           * Filters the normalized content to allow for additional cleanup.
           *
           * @since 0.1.0
           * @hook classifai_normalize
           *
           * @param {string} $post_content The normalized post content.
           * @param {int}    $post_id      The ID of the post whose content is being normalized.
           *
           * @return {string} The filtered normalized post content.
           */
          return apply_filters( 'classifai_normalize', trim( $post_content ), $post_id );
      }

      /**
       * Runs preg_replace() and returns the subject untouched if PCRE fails.
       *
       * preg_replace() returns null on an error (for example when the backtrack
       * limit is hit). Without this guard the whole text would be discarded.
       *
       * @param string $pattern     The regular expression.
       * @param string $replacement The replacement string.
       * @param string $subject     The text to operate on.
       * @return string The replaced text, or $subject when the replacement failed.
       */
      private function replace_or_keep( $pattern, $replacement, $subject ) {
          $result = preg_replace( $pattern, $replacement, $subject );

          return null === $result ? $subject : $result;
      }
  }