More general content type detection
author: Michael <heluecht@pirati.ca>
Sat, 13 Mar 2021 13:17:42 +0000 (13:17 +0000)
committer: Michael <heluecht@pirati.ca>
Sat, 13 Mar 2021 13:17:42 +0000 (13:17 +0000)
src/Util/ParseUrl.php

index 83d0d84..da6c88a 100644 (file)
@@ -54,25 +54,21 @@ class ParseUrl
        /**
         * Fetch the content type of the given url
         * @param string $url URL of the page
-        * @return string content type 
+        * @return array content type 
         */
        public static function getContentType(string $url)
        {
                $curlResult = DI::httpRequest()->head($url);
                if (!$curlResult->isSuccess()) {
-                       return '';
+                       return [];
                }
 
                $contenttype =  $curlResult->getHeader('Content-Type');
                if (empty($contenttype)) {
-                       return '';
-               }
-               
-               if (!preg_match('#(image|video|audio)/#i', $contenttype, $matches)) {
-                       return '';
+                       return [];
                }
 
-               return array_pop($matches);
+               return explode('/', current(explode(';', $contenttype)));
        }
 
        /**
@@ -211,8 +207,14 @@ class ParseUrl
                }
 
                $type = self::getContentType($url);
-               if (in_array($type, ['image', 'video', 'audio'])) {
-                       $siteinfo['type'] = $type;
+               Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]);
+               if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) {
+                       $siteinfo['type'] = $type[0];
+                       return $siteinfo;
+               }
+
+               if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) {
+                       Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]);
                        return $siteinfo;
                }
 
@@ -228,21 +230,6 @@ class ParseUrl
                        return $siteinfo;
                }
 
-               // Native media type, no need for HTML parsing
-               $type = $curlResult->getHeader('Content-Type');
-               if ($type) {
-                       preg_match('#(image|video|audio)/#i', $type, $matches);
-                       if ($matches) {
-                               $siteinfo['type'] = array_pop($matches);
-                               return $siteinfo;
-                       }
-               }
-
-               // If it isn't a HTML file then exit
-               if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
-                       return $siteinfo;
-               }
-
                if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
                        if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
                                $maxAge = max(86400, (int)array_pop($matches));