utf8-decoder.js 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. const b4a = require('b4a')
  2. /**
  3. * https://encoding.spec.whatwg.org/#utf-8-decoder
  4. */
  5. module.exports = class UTF8Decoder {
  6. constructor () {
  7. this.codePoint = 0
  8. this.bytesSeen = 0
  9. this.bytesNeeded = 0
  10. this.lowerBoundary = 0x80
  11. this.upperBoundary = 0xbf
  12. }
  13. get remaining () {
  14. return this.bytesSeen
  15. }
  16. decode (data) {
  17. // If we have a fast path, just sniff if the last part is a boundary
  18. if (this.bytesNeeded === 0) {
  19. let isBoundary = true
  20. for (let i = Math.max(0, data.byteLength - 4), n = data.byteLength; i < n && isBoundary; i++) {
  21. isBoundary = data[i] <= 0x7f
  22. }
  23. if (isBoundary) return b4a.toString(data, 'utf8')
  24. }
  25. let result = ''
  26. for (let i = 0, n = data.byteLength; i < n; i++) {
  27. const byte = data[i]
  28. if (this.bytesNeeded === 0) {
  29. if (byte <= 0x7f) {
  30. result += String.fromCharCode(byte)
  31. } else {
  32. this.bytesSeen = 1
  33. if (byte >= 0xc2 && byte <= 0xdf) {
  34. this.bytesNeeded = 2
  35. this.codePoint = byte & 0x1f
  36. } else if (byte >= 0xe0 && byte <= 0xef) {
  37. if (byte === 0xe0) this.lowerBoundary = 0xa0
  38. else if (byte === 0xed) this.upperBoundary = 0x9f
  39. this.bytesNeeded = 3
  40. this.codePoint = byte & 0xf
  41. } else if (byte >= 0xf0 && byte <= 0xf4) {
  42. if (byte === 0xf0) this.lowerBoundary = 0x90
  43. if (byte === 0xf4) this.upperBoundary = 0x8f
  44. this.bytesNeeded = 4
  45. this.codePoint = byte & 0x7
  46. } else {
  47. result += '\ufffd'
  48. }
  49. }
  50. continue
  51. }
  52. if (byte < this.lowerBoundary || byte > this.upperBoundary) {
  53. this.codePoint = 0
  54. this.bytesNeeded = 0
  55. this.bytesSeen = 0
  56. this.lowerBoundary = 0x80
  57. this.upperBoundary = 0xbf
  58. result += '\ufffd'
  59. continue
  60. }
  61. this.lowerBoundary = 0x80
  62. this.upperBoundary = 0xbf
  63. this.codePoint = (this.codePoint << 6) | (byte & 0x3f)
  64. this.bytesSeen++
  65. if (this.bytesSeen !== this.bytesNeeded) continue
  66. result += String.fromCodePoint(this.codePoint)
  67. this.codePoint = 0
  68. this.bytesNeeded = 0
  69. this.bytesSeen = 0
  70. }
  71. return result
  72. }
  73. flush () {
  74. const result = this.bytesNeeded > 0 ? '\ufffd' : ''
  75. this.codePoint = 0
  76. this.bytesNeeded = 0
  77. this.bytesSeen = 0
  78. this.lowerBoundary = 0x80
  79. this.upperBoundary = 0xbf
  80. return result
  81. }
  82. }