5

To make it easier to visualize, here is my Record lookup table.

I just can't seem to find anywhere online where it tells you which of these are supposed to also contain charset=utf-8.

Should I just assume it's anything similar to text?

Take a look:

/**
 * Lookup table from a lower-case file extension (including the leading dot)
 * to the media type sent in the `Content-Type` header.
 *
 * Values are bare media types without parameters; no `charset` is stored here.
 */
const MEDIA_TYPES: Record<string, string> = {
  ".md": "text/markdown",
  ".html": "text/html",
  ".htm": "text/html",
  ".json": "application/json",
  ".map": "application/json",
  ".txt": "text/plain",
  ".ts": "text/typescript",
  ".tsx": "text/tsx",
  ".js": "application/javascript",
  ".jsx": "text/jsx",
  ".gz": "application/gzip",
  ".css": "text/css",
  ".wasm": "application/wasm",
  ".mjs": "application/javascript",
  ".otf": "font/otf",
  ".ttf": "font/ttf",
  ".woff": "font/woff",
  ".woff2": "font/woff2",
  ".conf": "text/plain",
  ".list": "text/plain",
  ".log": "text/plain",
  ".ini": "text/plain",
  ".vtt": "text/vtt",
  ".yaml": "text/yaml",
  ".yml": "text/yaml",
  ".mid": "audio/midi",
  ".midi": "audio/midi",
  // The registered type for MP3 is audio/mpeg, not audio/mp3.
  ".mp3": "audio/mpeg",
  ".mp4a": "audio/mp4",
  ".m4a": "audio/mp4",
  ".ogg": "audio/ogg",
  ".spx": "audio/ogg",
  ".opus": "audio/ogg",
  ".wav": "audio/wav",
  // .webm is conventionally a video container.
  ".webm": "video/webm",
  ".aac": "audio/x-aac",
  ".flac": "audio/x-flac",
  ".mp4": "video/mp4",
  ".mp4v": "video/mp4",
  ".mkv": "video/x-matroska",
  ".mov": "video/quicktime",
  ".svg": "image/svg+xml",
  ".avif": "image/avif",
  ".bmp": "image/bmp",
  ".gif": "image/gif",
  ".heic": "image/heic",
  ".heif": "image/heif",
  ".jpeg": "image/jpeg",
  ".jpg": "image/jpeg",
  ".png": "image/png",
  ".tiff": "image/tiff",
  ".psd": "image/vnd.adobe.photoshop",
  ".ico": "image/vnd.microsoft.icon",
  ".webp": "image/webp",
  ".es": "application/ecmascript",
  ".epub": "application/epub+zip",
  ".jar": "application/java-archive",
  ".war": "application/java-archive",
  ".webmanifest": "application/manifest+json",
  ".doc": "application/msword",
  ".dot": "application/msword",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".dotx": "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
  ".cjs": "application/node",
  ".bin": "application/octet-stream",
  ".pkg": "application/octet-stream",
  ".dump": "application/octet-stream",
  ".exe": "application/octet-stream",
  ".deploy": "application/octet-stream",
  ".img": "application/octet-stream",
  ".msi": "application/octet-stream",
  ".pdf": "application/pdf",
  ".pgp": "application/pgp-encrypted",
  ".asc": "application/pgp-signature",
  ".sig": "application/pgp-signature",
  ".ai": "application/postscript",
  ".eps": "application/postscript",
  ".ps": "application/postscript",
  ".rdf": "application/rdf+xml",
  ".rss": "application/rss+xml",
  ".rtf": "application/rtf",
  ".apk": "application/vnd.android.package-archive",
  ".key": "application/vnd.apple.keynote",
  // Numbers spreadsheets have their own type; they are not Keynote decks.
  ".numbers": "application/vnd.apple.numbers",
  ".pages": "application/vnd.apple.pages",
  ".geo": "application/vnd.dynageo",
  ".gdoc": "application/vnd.google-apps.document",
  ".gslides": "application/vnd.google-apps.presentation",
  ".gsheet": "application/vnd.google-apps.spreadsheet",
  ".kml": "application/vnd.google-earth.kml+xml",
  // Zipped KML uses the .kmz extension.
  ".kmz": "application/vnd.google-earth.kmz",
  ".icc": "application/vnd.iccprofile",
  ".icm": "application/vnd.iccprofile",
  ".xls": "application/vnd.ms-excel",
  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".xlm": "application/vnd.ms-excel",
  ".ppt": "application/vnd.ms-powerpoint",
  ".pot": "application/vnd.ms-powerpoint",
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ".potx": "application/vnd.openxmlformats-officedocument.presentationml.template",
  ".xps": "application/vnd.ms-xpsdocument",
  ".odc": "application/vnd.oasis.opendocument.chart",
  ".odb": "application/vnd.oasis.opendocument.database",
  ".odf": "application/vnd.oasis.opendocument.formula",
  ".odg": "application/vnd.oasis.opendocument.graphics",
  ".odp": "application/vnd.oasis.opendocument.presentation",
  ".ods": "application/vnd.oasis.opendocument.spreadsheet",
  ".odt": "application/vnd.oasis.opendocument.text",
  ".rar": "application/vnd.rar",
  ".unityweb": "application/vnd.unity",
  ".dmg": "application/x-apple-diskimage",
  ".bz": "application/x-bzip",
  ".crx": "application/x-chrome-extension",
  ".deb": "application/x-debian-package",
  ".php": "application/x-httpd-php",
  ".iso": "application/x-iso9660-image",
  ".sh": "application/x-sh",
  ".sql": "application/x-sql",
  ".srt": "application/x-subrip",
  ".xml": "application/xml",
  ".zip": "application/zip",
};

/**
 * Returns the content-type based on the extension of a path.
 *
 * @param pathname A URL or file path such as "/dashboard/v1/index.html".
 *   A bare extension such as ".html" is accepted as well.
 * @returns The MEDIA_TYPES entry for the path's extension, or `undefined`
 *   when the path has no extension or the extension is not listed.
 */
function contentType(pathname: string): string | undefined {
  // Look only at the last path segment so dots in directory names are ignored.
  const fileName = pathname.slice(pathname.lastIndexOf("/") + 1);

  const dotIndex = fileName.lastIndexOf(".");
  if (dotIndex === -1) return undefined;

  // The table keys are lower-case and include the leading dot.
  const extension = fileName.slice(dotIndex).toLowerCase();
  return MEDIA_TYPES[extension];
}

// Demo: print whatever contentType() returns for two sample request paths.
for (const samplePath of ["/dashboard/v1/index.html", "/dashboard/v1/logo.svg"]) {
  console.log(contentType(samplePath));
}
suchislife
  • 4,251
  • 10
  • 47
  • 78
  • 2
    What do you mean by "*are supposed to also contain `charset=utf-8`*"? The charset (encoding) does not depend on, and cannot be derived from, the file extension. – Bergi Jan 09 '22 at 19:10
  • I meant which "known file extensions" whose content type can usually be predicted provided someone isn't just mislabeling file extensions for the fun of it. – suchislife Jan 09 '22 at 19:16
  • 2
    No, I meant what I said: you cannot predict the charset from the file extension. A `.html` file should have html content (unless mislabelled), but you can't tell from the file extension whether that html content is encoded as UTF-8 or something else. – Bergi Jan 09 '22 at 19:19
  • Ohhhhhhhhhhhhhh. I see your point. Perhaps determine it based on http header `'Content-Language': 'en-US',`. Hmmm. Excellent point. – suchislife Jan 09 '22 at 19:20
  • It doesn't depend on the language either. What exactly is your use case, what do you need this for? – Bergi Jan 09 '22 at 19:21
  • A static file server. And right again. Language isn't ultimately ideal. – suchislife Jan 09 '22 at 19:22
  • 1
    So you read the files from disk and serve them via HTTP, and want to add the appropriate content-type header? Ok. Who stores the files on the disk, and what encoding do they use for text files? – Bergi Jan 09 '22 at 19:24
  • The user stores files on the disk. The encoding they use is... Oh. I see what you did there. The idea is, you can run this script and serve a directory with sub-directories as a file server. – suchislife Jan 09 '22 at 19:28
  • Actually I wanted you to answer "*they always use utf-8 because that's the best practice*" :-) (If that were the case, you could and should just hardcode that value). But if they don't, yes, you're in trouble. You can try a heuristic (see e.g. https://chardet.readthedocs.io/en/latest/faq.html#what-is-character-encoding-auto-detection, https://superuser.com/questions/301552/how-to-auto-detect-text-file-encoding) but it won't work 100% of the time. I'd rather recommend to omit the `charset` parameter altogether if you don't know, and let the browser handle it. – Bergi Jan 09 '22 at 19:34
  • And to allow the browser to handle it, I must NOT use http header `X-Content-Type-Options: nosniff`, correct? – suchislife Jan 09 '22 at 19:46
  • I think since you *are* providing a content-type (just one without a charset), it wouldn't be sniffed anyway. Encoding autodetection is something separate I think. – Bergi Jan 09 '22 at 19:54

1 Answer

4

MDN Says:

For example, for any MIME type whose main type is text, you can add the optional charset parameter to specify the character set used for the characters in the data. If no charset is specified, the default is ASCII (US-ASCII) unless overridden by the user agent's settings. To specify a UTF-8 text file, the MIME type text/plain;charset=UTF-8 is used.

So, for anything based on text/... you can optionally add the charset.

https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types#structure_of_a_mime_type

The following update to contentType() function demonstrates one solution.

/**
 * Returns the Content-Type header value for a file extension.
 *
 * Types whose main type is `text/`, plus a short list of text-based
 * `application/` types, get `; charset=UTF-8` appended. The charset is
 * hard-coded, not detected from the file contents.
 *
 * @param ext File extension including the leading dot, e.g. ".html".
 *   Matching is case-insensitive.
 * @returns The header value, or `undefined` when the extension is not listed
 *   in MEDIA_TYPES.
 */
function contentType(ext: string): string | undefined {
  const charsetUtf8 = "; charset=UTF-8";

  // Content types that do not start with text/.. but usually contain charset=utf-8
  const specialCase = [
    "application/json",
    "application/xml",
    "application/rss+xml",
  ];

  // Table keys are lower-case, so ".HTML" should match ".html".
  const normalizedExt = ext.toLowerCase();

  // Own-property check: a plain-object lookup would otherwise return inherited
  // members (e.g. "constructor"), which are not strings and would make the
  // startsWith() call below throw.
  if (!Object.prototype.hasOwnProperty.call(MEDIA_TYPES, normalizedExt)) {
    return undefined;
  }

  const mediaType = MEDIA_TYPES[normalizedExt];
  if (!mediaType) return undefined;

  const isTextual = mediaType.startsWith("text/") || specialCase.includes(mediaType);
  return isTextual ? mediaType + charsetUtf8 : mediaType;
}

Then theoretically, you could return a 422: Unprocessable Entity on any content type NOT present in the mime type list.

// Build the on-disk path from the configured server root plus the directory and
// file name taken from the parsed request URL.
let requestedFilePath = path.join("./", context.main.http.server.root, urlPathNameParsed.dir, urlPathNameParsed.base);

// NOTE(review): the matching catch/closing brace for this try is not part of
// this excerpt; presumably a failed read is handled there — confirm.
try {
  // try as file...
  const file = await Deno.readFile(requestedFilePath);

  // If file extension is not on media list...
  // (contentType() returns undefined for extensions it does not know.)
  if(typeof(contentType(urlPathNameParsed.ext)) === 'undefined') { 

    // Set Content-Type header for this response
    // (the error body below is plain text, so it is labelled as such).
    responseHeaders.set('Content-Type', 'text/plain; charset=UTF-8');

    // Reject the request with HTTP 422 instead of serving a file of unknown type.
    await e.respondWith(
      new Response("HTTP 422: Unprocessable Entity", {
        headers: responseHeaders,
        status: 422,
        statusText: 'Unprocessable Entity',
      })
    );

  } else {

    // Set Content-Type header for this response
    // (contentType() is called a second time here; this branch only runs when
    // the first call returned a value).
    responseHeaders.set('Content-Type', `${contentType(urlPathNameParsed.ext)}`);

    // Build and send the response
    // with the bytes read from disk as the body.
    await e.respondWith(
      new Response(file, {
        headers: responseHeaders, 
        status: 200, 
        statusText: 'OK'
      })
    );

  }
suchislife
  • 4,251
  • 10
  • 47
  • 78
Luca Kiebel
  • 9,790
  • 7
  • 29
  • 44
  • 1
    THAT, clarifies it. So much easier. Thank you! – suchislife Jan 09 '22 at 16:45
  • Although... `application/json` usually contains it too. – suchislife Jan 09 '22 at 18:29
  • 1
    https://stackoverflow.com/questions/9254891/what-does-content-type-application-json-charset-utf-8-really-mean/9254967 – Luca Kiebel Jan 09 '22 at 18:30
  • 1
    Not sure how to reformulate this, but that doesn't mean that **only** the ones that start with `text/` may need this. As Op already stated a few `application/` like `application/json` or `application/ecmascript` and `application/xml`, along with `image/svg+xml` and maybe others I'm not aware of or don't remember. The rule of thumb is then more "anything that is consumed as text". Also to be noted that some UAs (looking at you Safari) do choke on the presence of the `charset` header in some cases (like when generating an svg Blob and trying to display it). – Kaiido Jan 10 '22 at 01:44
  • 1
    Note that `application/json` should not contain the `charset` parameter. When decoding JSON, you need to follow the steps inside the JSON standard for automatic charset detection of the small list of allowed types – Ferrybig Apr 07 '22 at 08:01