downloadutils.js 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. /*
  2. * Copyright 2014 Mozilla Foundation
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /* eslint-disable no-var */
  17. "use strict";
  18. var fs = require("fs");
  19. var crypto = require("crypto");
  20. var http = require("http");
  21. var https = require("https");
  /**
   * Rewrite a Web Archive (web.archive.org) URL so that the archived file
   * itself is served.
   *
   * Web Archive URLs need to be transformed to add `if_` after the ID.
   * Without this, an HTML page containing an iframe with the PDF file
   * will be served instead (issue 8920).
   *
   * @param {string} url - The URL to inspect.
   * @returns {string} The rewritten URL, or `url` unchanged when it does not
   *   match the Web Archive pattern.
   */
  function rewriteWebArchiveUrl(url) {
    // No `g` flag: only a single anchored match is wanted, and a global regex
    // carries `lastIndex` state between `exec` calls.
    const webArchiveRegex =
      /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/;
    const urlParts = webArchiveRegex.exec(url);
    if (!urlParts) {
      return url;
    }
    const [, prefix, archiveId, target] = urlParts;
    return `${prefix}${archiveId}if_${target}`;
  }
  /**
   * Download `url` into `file`, following HTTP redirects.
   *
   * The URL is first passed through `rewriteWebArchiveUrl`; `https` is used
   * when the resulting URL starts with `https://`, otherwise `http`.
   *
   * @param {string} file - Path of the file to write the response body to.
   * @param {string} url - The URL to fetch.
   * @param {function} callback - Called with no arguments on success, or with
   *   an error (a string such as `"HTTP 404"` / `"Too many redirects"`, or an
   *   `Error` from the request or the write stream) on failure.
   * @param {number} [redirects] - Number of redirects followed so far; used
   *   internally when recursing.
   */
  function downloadFile(file, url, callback, redirects = 0) {
    const MAX_REDIRECTS = 10;
    const REDIRECT_CODES = [301, 302, 307, 308];

    const targetUrl = rewriteWebArchiveUrl(url);
    const protocol = /^https:\/\//.test(targetUrl) ? https : http;
    protocol
      .get(targetUrl, function (response) {
        const statusCode = response.statusCode;

        if (REDIRECT_CODES.includes(statusCode)) {
          // The body of a redirect is never read; drain it so that the
          // underlying socket is released.
          response.resume();
          if (redirects > MAX_REDIRECTS) {
            // Stop here: without this `return` the redirect would still be
            // followed and `callback` invoked a second time.
            callback("Too many redirects");
            return;
          }
          const location = response.headers.location;
          if (!location) {
            callback("HTTP " + statusCode + " without a Location header");
            return;
          }
          let redirectTo;
          try {
            // `Location` may be relative; resolve it against the current URL.
            redirectTo = new URL(location, targetUrl).href;
          } catch (err) {
            callback(err);
            return;
          }
          downloadFile(file, redirectTo, callback, redirects + 1);
          return;
        }

        if (statusCode !== 200) {
          response.resume(); // Discard the unwanted body.
          callback("HTTP " + statusCode);
          return;
        }

        const stream = fs.createWriteStream(file);
        stream.on("error", function (err) {
          callback(err);
        });
        // `pipe` ends the write stream once the response has been consumed,
        // so "finish" signals that everything was flushed to `file`.
        stream.on("finish", function () {
          callback();
        });
        response.pipe(stream);
      })
      .on("error", function (err) {
        callback(err);
      });
  }
  /**
   * Download, one at a time, every linked manifest file that is not yet
   * present on disk.
   *
   * For each manifest item with a truthy `link` whose `file` does not exist,
   * the URL is read from `<file>.link` (trailing whitespace removed) and
   * fetched with `downloadFile`. When a download fails, `file` is written as
   * an empty file and the error text is stored in `<file>.error`; processing
   * then continues with the next item.
   *
   * @param {Array<Object>} manifest - Manifest items; `file` and `link` are
   *   the properties read here.
   * @param {function} callback - Called without arguments once every pending
   *   download has been attempted.
   */
  function downloadManifestFiles(manifest, callback) {
    const links = manifest
      .filter(function (item) {
        return item.link && !fs.existsSync(item.file);
      })
      .map(function (item) {
        const file = item.file;
        const url = fs
          .readFileSync(file + ".link")
          .toString()
          .replace(/\s+$/, "");
        return { file, url };
      });
    let i = 0;

    function downloadNext() {
      if (i >= links.length) {
        callback();
        return;
      }
      const { file, url } = links[i];
      console.log("Downloading " + url + " to " + file + "...");
      downloadFile(file, url, function (err) {
        if (err) {
          console.error("Error during downloading of " + url + ": " + err);
          fs.writeFileSync(file, ""); // making it empty file
          // `err` can be an `Error` object (request/stream failures), which
          // `writeFileSync` rejects; always store its string form.
          fs.writeFileSync(file + ".error", String(err));
        }
        i++;
        downloadNext();
      });
    }

    downloadNext();
  }
  /**
   * Compute the MD5 digest of a file by streaming its contents.
   *
   * @param {string} file - Path of the file to hash.
   * @param {function} callback - Node-style callback: invoked with the read
   *   error as the only argument on failure, or with `null` and the
   *   hex-encoded digest once the whole file has been read.
   */
  function calculateMD5(file, callback) {
    const md5 = crypto.createHash("md5");
    fs.createReadStream(file)
      .on("data", chunk => {
        md5.update(chunk);
      })
      .on("error", readError => {
        callback(readError);
      })
      .on("end", () => {
        callback(null, md5.digest("hex"));
      });
  }
  /**
   * Check every manifest item, one after another, and report problems.
   *
   * An item is flagged when:
   *  - a `<file>.error` marker exists,
   *  - it has a truthy `link` but no `<file>.link` file exists,
   *  - its file cannot be read for hashing,
   *  - it has no `md5` property, or
   *  - the computed MD5 differs from `item.md5`.
   *
   * Every problem is printed as a warning with `console.error`.
   *
   * @param {Array<Object>} manifest - Manifest items; `id`, `file`, `link`
   *   and `md5` are the properties read here.
   * @param {function} callback - Called once with `true` when at least one
   *   problem was found, `false` otherwise.
   */
  function verifyManifestFiles(manifest, callback) {
    let i = 0;
    let error = false;

    // Single reporting path, so that all warnings go to the same stream.
    function warn(message) {
      console.error(message);
      error = true;
    }

    function verifyNext() {
      if (i >= manifest.length) {
        callback(error);
        return;
      }
      const item = manifest[i++];

      if (fs.existsSync(item.file + ".error")) {
        warn(
          'WARNING: File was not downloaded. See "' + item.file + '.error" file.'
        );
        verifyNext();
        return;
      }
      if (item.link && !fs.existsSync(item.file + ".link")) {
        warn(
          `WARNING: Unneeded \`"link": true\`-entry for the "${item.id}" test.`
        );
        verifyNext();
        return;
      }

      calculateMD5(item.file, function (err, md5) {
        if (err) {
          warn('WARNING: Unable to open file for reading "' + err + '".');
        } else if (!item.md5) {
          warn(
            'WARNING: Missing md5 for file "' +
              item.file +
              '". ' +
              'Hash for current file is "' +
              md5 +
              '"'
          );
        } else if (md5 !== item.md5) {
          warn(
            'WARNING: MD5 of file "' +
              item.file +
              '" does not match file. Expected "' +
              item.md5 +
              '" computed "' +
              md5 +
              '"'
          );
        }
        verifyNext();
      });
    }

    verifyNext();
  }
  177. exports.downloadManifestFiles = downloadManifestFiles;
  178. exports.verifyManifestFiles = verifyManifestFiles;