decoder_spec.js 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. var should = require('should'),
  2. needle = require('./../'),
  3. decoder = require('./../lib/decoder'),
  4. Q = require('q'),
  5. chardet = require('jschardet'),
  6. fs = require('fs'),
  7. http = require('http'),
  8. helpers = require('./helpers');
  9. describe('character encoding', function() {
  10. this.timeout(5000);
  // Suite: a local HTTP server answers every request with the contents of
  // test/tomcat_charset.html and a `charset=EUC-JP` Content-Type header.
  // The two tests check what needle hands back with `decode` off and on.
  describe('Given content-type: "text/html; charset=EUC-JP"', function() {
    var port = 2233;
    var server;
    // Declared in suite scope so the assignment in before() does not create
    // an implicit global.
    var url;

    // Builds (but does not start) the fixture server.
    function createServer() {
      return http.createServer(function(req, res) {
        // The request body is irrelevant; only its end triggers the response.
        req.on('data', function(chunk) {});
        req.on('end', function() {
          // We used to pull from a particular site that is no longer up.
          // This is a local mirror pulled from archive.org
          // https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html
          //
          // NOTE(review): the path is relative, so it resolves against the
          // process working directory -- presumably the project root.
          fs.readFile('test/tomcat_charset.html', function(err, data) {
            if (err) {
              res.writeHead(404);
              res.end(JSON.stringify(err));
              return;
            }
            // writeHead() is the documented API; writeHeader() is a
            // deprecated alias.
            res.writeHead(200, { 'Content-Type': 'text/html; charset=EUC-JP' });
            res.end(data);
          });
        });
      });
    }

    before(function(done) {
      server = createServer();
      url = 'http://localhost:' + port;
      server.listen(port, done);
    });

    after(function(done) {
      server.close(done);
    });

    describe('with decode = false', function() {
      it('does not decode', function(done) {
        needle.get(url, { decode: false }, function(err, resp) {
          // Report a request failure directly instead of letting the
          // assertions below crash on an undefined response.
          if (err) return done(err);

          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('windows-1252');
          resp.body.indexOf('EUCを使う').should.eql(-1);
          done();
        });
      });
    });

    describe('with decode = true', function() {
      it('decodes', function(done) {
        needle.get(url, { decode: true }, function(err, resp) {
          // Report a request failure directly instead of letting the
          // assertions below crash on an undefined response.
          if (err) return done(err);

          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ascii');
          resp.body.indexOf('EUCを使う').should.not.eql(-1);
          done();
        });
      });
    });
  });
  // Suite: requests the same external page twice -- once with `decode`
  // enabled and once with it disabled -- and compares the detected charset
  // and the presence of a known phrase in each body.
  // This test talks to a live site, so it needs network access.
  describe('Given content-type: "text/html but file is charset: gb2312', function() {
    it('encodes to UTF-8', function(done) {
      // Phrase looked up in both bodies.
      var phrase = '全球中文网站前二十强';

      // Promise-returning wrapper around needle.get for the chinese website.
      var fetchPage = Q.nbind(needle.get, needle, 'http://www.chinesetop100.com/');

      // The wrapper resolves with an array; its first entry carries the body.
      function extractBody(settled) {
        return settled[0].body;
      }

      // One request per option set: decoded first, raw second.
      var pendingBodies = [{decode: true}, {decode: false}].map(function(options) {
        return Q.fcall(fetchPage, options).then(extractBody);
      });

      // Both requests run concurrently; assertions wait for both to finish.
      Q.all(pendingBodies).done(function(bodies) {
        var decodedBody = bodies[0];
        var rawBody = bodies[1];
        var decodedCharset = chardet.detect(decodedBody).encoding;
        var rawCharset = chardet.detect(rawBody).encoding;

        // The first response was requested with decoding on...
        decodedCharset.should.equal('ascii');
        decodedBody.indexOf(phrase).should.not.equal(-1);

        // ...the second one with decoding off.
        rawCharset.should.equal('windows-1252');
        rawBody.indexOf(phrase).should.equal(-1);

        done();
      });
    });
  });
  // Suite: a helper-provided local server is configured with `text` as its
  // response and a Content-Type of "text/html" that names no charset.
  // NOTE(review): the exact behaviour of helpers.server() is defined in
  // ./helpers and is not visible here.
  describe('Given content-type: "text/html"', function () {
    var server,
        port = 54321,
        text = 'Magyarországi Fióktelepe';

    before(function(done) {
      server = helpers.server({
        port: port,
        response: text,
        headers: { 'Content-Type': 'text/html' }
      }, done);
    });

    after(function(done) {
      server.close(done);
    });

    describe('with decode = false', function () {
      it('decodes by default to utf-8', function (done) {
        needle.get('http://localhost:' + port, { decode: false }, function (err, resp) {
          // Report a request failure directly instead of letting the
          // assertions below crash on an undefined response.
          if (err) return done(err);

          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ISO-8859-2');
          // Compare against the same fixture string the server was given.
          resp.body.should.eql(text);
          done();
        });
      });
    });
  });
  // Suite: writes one multibyte character to the object returned by
  // decoder(), one byte per write() call, and checks that the concatenated
  // output is the complete character when read as UTF-8.
  describe('multibyte characters split across chunks', function () {
    // The character every case is expected to produce.
    var expected = '慶';

    // Byte sequence for that character in each encoding under test.
    var cases = [
      { encoding: 'utf-8',   bytes: [0xE6, 0x85, 0xB6] },
      { encoding: 'euc-jp',  bytes: [0xB7, 0xC4] },
      { encoding: 'gb18030', bytes: [0x91, 0x63] }
    ];

    // One sub-suite per encoding; titles match the hand-written originals.
    cases.forEach(function(testCase) {
      describe('with encoding = ' + testCase.encoding, function() {
        var d,
            result = Buffer.allocUnsafe(0);

        before(function(done) {
          d = decoder(testCase.encoding);
          done();
        });

        it('reassembles split multibyte characters', function (done) {
          // Accumulate everything the decoder emits.
          d.on("data", function(chunk) {
            result = Buffer.concat([ result, chunk ]);
          });

          d.on("end", function() {
            result.toString("utf-8").should.eql(expected);
            done();
          });

          // Write the character one byte at a time so that every chunk
          // boundary falls inside the multibyte sequence.
          testCase.bytes.forEach(function(byte) {
            d.write(Buffer.from([byte]));
          });
          d.end();
        });
      });
    });
  });
  181. })