Fix substituting smilies and smilies containing whitespaces
authorgudzpoz <gudzpoz@live.com>
Sat, 25 Nov 2023 15:29:39 +0000 (23:29 +0800)
committergudzpoz <gudzpoz@live.com>
Sat, 25 Nov 2023 15:29:39 +0000 (23:29 +0800)
src/Content/Smilies.php
tests/Util/SmileyWhitespaceAddon.php [new file with mode: 0644]
tests/datasets/api.fixture.php
tests/src/Content/SmiliesTest.php
tests/src/Factory/Api/Mastodon/StatusTest.php
tests/src/Protocol/ActivityPub/TransmitterTest.php

index 9c357a9..01ac832 100644 (file)
@@ -153,43 +153,6 @@ class Smilies
                return $params;
        }
 
-       /**
-        * Normalizes smiley shortcodes into texts with no special symbols.
-        *
-        * @return array
-        *    'texts' => smilie shortcut
-        *    'icons' => icon url or an empty string
-        *    'norms' => normalized shortcut
-        */
-       public static function getNormalizedList(): array
-       {
-               $smilies = self::getList();
-               $norms = [];
-               $icons = $smilies['icons'];
-               foreach ($smilies['texts'] as $i => $shortcode) {
-                       // Extract urls
-                       $icon = $icons[$i];
-                       if (preg_match('/src="(.+?)"/', $icon, $match)) {
-                               $icon = $match[1];
-                       } else {
-                               $icon = '';
-                       }
-                       $icons[$i] = $icon;
-
-                       // Normalize name
-                       $norm = preg_replace('/[\s\-:#~]/', '', $shortcode);
-                       if (ctype_alnum($norm)) {
-                               $norms[] = $norm;
-                       } elseif (preg_match('#/smiley-(\w+)\.gif#', $icon, $match)) {
-                               $norms[] = $match[1];
-                       } else {
-                               $norms[] = 'smiley' . $i;
-                       }
-               }
-               $smilies['norms'] = $norms;
-               return $smilies;
-       }
-
        /**
         * Finds all used smilies (denoted by quoting colons like :heart:) in the provided text and normalizes their usages.
         *
@@ -206,18 +169,36 @@ class Smilies
                                if (strpos($text, '[nosmile]') !== false || self::noSmilies()) {
                                        return $text;
                                }
-                               $smilies = self::getNormalizedList();
-                               $normalized = array_combine($smilies['texts'], $smilies['norms']);
+                               $smilies = self::getList();
+                               $normalized = [];
                                return self::performForEachWordMatch(
                                        array_combine($smilies['texts'], $smilies['icons']),
                                        $text,
                                        function (string $name, string $image) use($normalized, &$emojis) {
-                                               $name = $normalized[$name];
+                                               if (array_key_exists($name, $normalized)) {
+                                                       return $normalized[$name];
+                                               }
                                                if (preg_match('/src="(.+?)"/', $image, $match)) {
-                                                       $image = $match[1];
-                                                       $emojis[$name] = $image;
+                                                       $url = $match[1];
+                                                       // Image smilies, which should be normalized instead of being embedded for some protocols like ActivityPub.
+                                                       // Normalize name
+                                                       $norm = preg_replace('/[\s\-:#~]/', '', $name);
+                                                       if (!ctype_alnum($norm)) {
+                                                               if (preg_match('#/smiley-(\w+)\.gif#', $url, $match)) {
+                                                                       $norm = $match[1];
+                                                               } else {
+                                                                       $norm = 'smiley' . count($normalized);
+                                                               }
+                                                       }
+                                                       $shortcode = ':' . $norm . ':';
+                                                       $normalized[$name] = $shortcode;
+                                                       $emojis[$norm] = $url;
+                                                       return $shortcode;
+                                               } else {
+                                                       $normalized[$name] = $image;
+                                                       // Probably text-substitution smilies (e.g., Unicode ones).
+                                                       return $image;
                                                }
-                                               return ':' . $name . ':';
                                        },
                                );
                        });
@@ -240,11 +221,15 @@ class Smilies
                $ord2_bitset = 0;
                $prefixes = [];
                foreach ($words as $word => $_) {
-                       if (strlen($word) < 2 || !ctype_graph($word)) {
+                       if (strlen($word) < 2) {
                                continue;
                        }
                        $ord1 = ord($word);
                        $ord2 = ord($word[1]);
+                       // A smiley shortcode must not begin or end with whitespace (ctype_* expects strings, not ord() ints).
+                       if (ctype_space($word[0]) || ctype_space($word[strlen($word) - 1])) {
+                               continue;
+                       }
                        $ord1_bitset |= 1 << ($ord1 & 31);
                        $ord2_bitset |= 1 << ($ord2 & 31);
                        if (!array_key_exists($word[0], $prefixes)) {
@@ -253,52 +238,37 @@ class Smilies
                        $prefixes[$word[0]][] = $word;
                }
 
+               $slength = strlen($subject);
                $result = '';
+               // $processed is used to delay string concatenation since appending a char every loop is inefficient.
                $processed = 0;
-               $s_start = 0; // Segment start
-               // No spaces are allowed in smilies, so they can serve as delimiters.
-               // Splitting by some delimiters may not necessary though?
-               while (true) {
-                       if ($s_start >= strlen($subject)) {
-                               $result .= substr($subject, $processed);
-                               break;
-                       }
-                       if (preg_match('/\s+?(?=\S|$)/', $subject, $match, PREG_OFFSET_CAPTURE, $s_start)) {
-                               [$whitespaces, $s_end] = $match[0];
-                       } else {
-                               $s_end = strlen($subject);
-                               $whitespaces = '';
-                       }
-                       $s_length = $s_end - $s_start;
-                       if ($s_length > 1) {
-                               $segment = substr($subject, $s_start, $s_length);
-                               // Find possible starting points for smilies.
-                               // For built-in smilies, the two bitsets should make attempts quite efficient.
-                               // However, presuming custom smilies follow the format of ":shortcode" or ":shortcode:",
-                               // if the user adds more smilies (with addons), the second bitset may eventually become useless.
-                               for ($i = 0; $i < $s_length - 1; $i++) {
-                                       $c = $segment[$i];
-                                       $d = $segment[$i + 1];
-                                       if (($ord1_bitset & (1 << (ord($c) & 31))) && ($ord2_bitset & (1 << (ord($d) & 31))) && array_key_exists($c, $prefixes)) {
-                                               foreach ($prefixes[$c] as $word) {
-                                                       $wlength = strlen($word);
-                                                       if ($wlength <= $s_length - $i && substr($segment, $i, $wlength) === $word) {
-                                                               // Check for boundaries
-                                                               if (($i === 0 || ctype_space($segment[$i - 1]) || ctype_punct($segment[$i - 1]))
-                                                                       && ($i + $wlength >= $s_length || ctype_space($segment[$i + $wlength]) || ctype_punct($segment[$i + $wlength]))) {
-                                                                       $result .= substr($subject, $processed, $s_start - $processed + $i);
-                                                                       $result .= call_user_func($callback, $word, $words[$word]);
-                                                                       $i += $wlength;
-                                                                       $processed = $s_start + $i;
-                                                                       $i--;
-                                                                       break;
-                                                               }
-                                                       }
+               // Find possible starting points for smilies.
+               // For built-in smilies, the two bitsets should make attempts quite efficient.
+               // However, presuming custom smilies follow the format of ":shortcode" or ":shortcode:",
+               // if the user adds more smilies (with addons), the second bitset may eventually become useless.
+               for ($i = 0; $i < $slength - 1; $i++) {
+                       $c = $subject[$i];
+                       $d = $subject[$i + 1];
+                       if (($ord1_bitset & (1 << (ord($c) & 31))) && ($ord2_bitset & (1 << (ord($d) & 31))) && array_key_exists($c, $prefixes)) {
+                               foreach ($prefixes[$c] as $word) {
+                                       $wlength = strlen($word);
+                                       if (substr($subject, $i, $wlength) === $word) {
+                                               // Check for boundaries
+                                               if (($i === 0 || ctype_space($subject[$i - 1]) || ctype_punct($subject[$i - 1]))
+                                                       && ($i + $wlength >= $slength || ctype_space($subject[$i + $wlength]) || ctype_punct($subject[$i + $wlength]))) {
+                                                       $result .= substr($subject, $processed, $i - $processed);
+                                                       $result .= call_user_func($callback, $word, $words[$word]);
+                                                       $i += $wlength;
+                                                       $processed = $i;
+                                                       $i--;
+                                                       break;
                                                }
                                        }
                                }
                        }
-                       $s_start = $s_end + strlen($whitespaces);
+               }
+               if ($processed < $slength) {
+                       $result .= substr($subject, $processed);
                }
                return $result;
        }
diff --git a/tests/Util/SmileyWhitespaceAddon.php b/tests/Util/SmileyWhitespaceAddon.php
new file mode 100644 (file)
index 0000000..5277d3d
--- /dev/null
@@ -0,0 +1,36 @@
+<?php
+/**
+ * @copyright Copyright (C) 2010-2023, the Friendica project
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ */
+
+use Friendica\Content\Smilies;
+
+function add_test_unicode_smilies(array &$b)
+{
+       // Text-substitution smilies: the replacement is an HTML character entity.
+       // - shortcode without whitespace
+       Smilies::add($b, '⽕', '&#x1F525;');
+       // - shortcode containing a single space
+       Smilies::add($b, ':hugging face:', '&#x1F917;');
+       // - shortcode containing several spaces
+       Smilies::add($b, ':face with hand over mouth:', '&#x1F92D;');
+       // Image-based smilies: the replacement is an <img> tag.
+       // - shortcode containing several spaces
+       Smilies::add($b, ':smiley heart 333:', '<img class="smiley" src="/images/smiley-heart.gif" alt="smiley-heart" title="smiley-heart" />');
+}
index 2bf38a5..1d2bb5f 100644 (file)
@@ -371,7 +371,7 @@ return [
                [
                        'uri-id' => 100,
                        'title'  => 'item_title',
-                       'body'   => ':like ~friendica no [code]:dislike[/code] :-p :-[ <3',
+                       'body'   => ':like ~friendica no [code]:dislike[/code] :-p :-[ :hugging face: <3 :smiley heart 333: ⽕',
                        'plink'  => 'https://friendica.local/post/100',
                ],
        ],
index e41e59c..820e378 100644 (file)
@@ -26,6 +26,7 @@
 namespace Friendica\Test\src\Content;
 
 use Friendica\Content\Smilies;
+use Friendica\Core\Hook;
 use Friendica\DI;
 use Friendica\Network\HTTPException\InternalServerErrorException;
 use Friendica\Test\FixtureTest;
@@ -37,6 +38,9 @@ class SmiliesTest extends FixtureTest
                parent::setUp();
 
                DI::config()->set('system', 'no_smilies', false);
+
+               Hook::register('smilie', 'tests/Util/SmileyWhitespaceAddon.php', 'add_test_unicode_smilies');
+               Hook::loadHooks();
        }
 
        public function dataLinks()
@@ -184,6 +188,26 @@ class SmiliesTest extends FixtureTest
                                'expected' => '(3&lt;33)',
                                'body' => '(3&lt;33)',
                        ],
+                       'space' => [
+                               'expected' => 'alt="smiley-heart"',
+                               'body' => ':smiley heart 333:',
+                       ],
+                       'substitution-1' => [
+                               'expected' => '&#x1F525;',
+                               'body' => '⽕',
+                       ],
+                       'substitution-2' => [
+                               'expected' => '&#x1F917;',
+                               'body' => ':hugging face:',
+                       ],
+                       'substitution-3' => [
+                               'expected' => '&#x1F92D;',
+                               'body' => ':face with hand over mouth:',
+                       ],
+                       'mixed' => [
+                               'expected' => '&#x1F525; &#x1F92D; invalid:hugging face: &#x1F917;',
+                               'body' => '⽕ :face with hand over mouth: invalid:hugging face: :hugging face:',
+                       ],
                ];
                foreach ([':-[', ':-D', 'o.O'] as $emoji) {
                        foreach (['A', '_', ':', '-'] as $prefix) {
@@ -245,6 +269,31 @@ class SmiliesTest extends FixtureTest
                                'body' => '~friendica',
                                'normalized' => ':friendica:'
                        ],
+                       'space' => [
+                               'expected' => ['smileyheart333'],
+                               'body' => ':smiley heart 333:',
+                               'normalized' => ':smileyheart333:'
+                       ],
+                       'substitution-1' => [
+                               'expected' => [],
+                               'body' => '⽕',
+                               'normalized' => '&#x1F525;',
+                       ],
+                       'substitution-2' => [
+                               'expected' => [],
+                               'body' => ':hugging face:',
+                               'normalized' => '&#x1F917;',
+                       ],
+                       'substitution-3' => [
+                               'expected' => [],
+                               'body' => ':face with hand over mouth:',
+                               'normalized' => '&#x1F92D;',
+                       ],
+                       'mixed' => [
+                               'expected' => [],
+                               'body' => '⽕ :face with hand over mouth: invalid:hugging face: :hugging face:',
+                               'normalized' => '&#x1F525; &#x1F92D; invalid:hugging face: &#x1F917;',
+                       ],
                ];
        }
 
index d150d85..df702fa 100644 (file)
@@ -21,8 +21,9 @@
 
 namespace Friendica\Test\src\Factory\Api\Mastodon;
 
-use Friendica\Model\Post;
+use Friendica\Core\Hook;
 use Friendica\DI;
+use Friendica\Model\Post;
 use Friendica\Test\FixtureTest;
 
 class StatusTest extends FixtureTest
@@ -35,6 +36,9 @@ class StatusTest extends FixtureTest
 
                DI::config()->set('system', 'no_smilies', false);
                $this->status = DI::mstdnStatus();
+
+               Hook::register('smilie', 'tests/Util/SmileyWhitespaceAddon.php', 'add_test_unicode_smilies');
+               Hook::loadHooks();
        }
 
        public function testSimpleStatus()
@@ -50,8 +54,8 @@ class StatusTest extends FixtureTest
                $post = Post::selectFirst([], ['id' => 14]);
                $this->assertNotNull($post);
                $result = $this->status->createFromUriId($post['uri-id'])->toArray();
-               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: ', $result['content']);
-               $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed'], true);
+               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: 🤗 ❤ :smileyheart333: 🔥', $result['content']);
+               $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed', 'smileyheart333'], true);
                $this->assertEquals(count($emojis), count($result['emojis']));
                foreach ($result['emojis'] as $emoji) {
                        $this->assertTrue(array_key_exists($emoji['shortcode'], $emojis));
index 49b51da..3eb9cb0 100644 (file)
@@ -21,6 +21,7 @@
 
 namespace Friendica\Test\src\Protocol\ActivityPub;
 
+use Friendica\Core\Hook;
 use Friendica\DI;
 use Friendica\Model\Post;
 use Friendica\Protocol\ActivityPub\Transmitter;
@@ -33,6 +34,9 @@ class TransmitterTest extends FixtureTest
                parent::setUp();
 
                DI::config()->set('system', 'no_smilies', false);
+
+               Hook::register('smilie', 'tests/Util/SmileyWhitespaceAddon.php', 'add_test_unicode_smilies');
+               Hook::loadHooks();
        }
 
        public function testEmojiPost()
@@ -42,8 +46,8 @@ class TransmitterTest extends FixtureTest
                $note = Transmitter::createNote($post);
                $this->assertNotNull($note);
 
-               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: ', $note['content']);
-               $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed'], true);
+               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: 🤗 ❤ :smileyheart333: 🔥', $note['content']);
+               $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed', 'smileyheart333'], true);
                $this->assertEquals(count($emojis), count($note['tag']));
                foreach ($note['tag'] as $emoji) {
                        $this->assertTrue(array_key_exists($emoji['name'], $emojis));