csv.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. // Copyright 2012 The Closure Library Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS-IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /**
  15. * @fileoverview Provides a parser that turns a string of well-formed CSV data
  16. * into an array of objects or an array of arrays. All values are returned as
  17. * strings; the user has to convert data into numbers or Dates as required.
  18. * Empty fields (adjacent commas) are returned as empty strings.
  19. *
  20. * This parser uses http://tools.ietf.org/html/rfc4180 as the definition of CSV.
  21. *
  22. * @author nnaze@google.com (Nathan Naze) Ported to Closure
  23. */
  24. goog.provide('goog.labs.format.csv');
  25. goog.provide('goog.labs.format.csv.ParseError');
  26. goog.provide('goog.labs.format.csv.Token');
  27. goog.require('goog.array');
  28. goog.require('goog.asserts');
  29. goog.require('goog.debug.Error');
  30. goog.require('goog.object');
  31. goog.require('goog.string');
  32. goog.require('goog.string.newlines');
  /**
   * @define {boolean} Enable verbose debugging. This is a flag so it can be
   * enabled in production if necessary post-compilation. Otherwise, debug
   * information will be stripped to minimize final code size.
   *
   * Defaults to goog.DEBUG. When this is false,
   * goog.labs.format.csv.ParseError builds neither its message nor its
   * line/column position.
   */
  goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING = goog.DEBUG;
  /**
   * Error thrown when parsing fails.
   *
   * The message and the `position` property are only populated when
   * goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING is true; otherwise the error
   * is created without a message and `position` stays null.
   *
   * @param {string} text The CSV source text being parsed.
   * @param {number} index The index, in the string, of the position of the
   *     error.
   * @param {string=} opt_message A description of the violated parse
   *     expectation.
   * @constructor
   * @extends {goog.debug.Error}
   * @final
   */
  goog.labs.format.csv.ParseError = function(text, index, opt_message) {
    var message;

    /**
     * The line and column (both 1-indexed) of the parse error, or null when
     * they were not computed.
     * @type {?{line: number, column: number}}
     */
    this.position = null;

    if (goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING) {
      message = opt_message || '';

      var info = goog.labs.format.csv.ParseError.findLineInfo_(text, index);
      // Skip the position details when no line could be located for the index,
      // so that building the error can never throw an error of its own.
      if (info && info.line) {
        var lineNumber = info.lineIndex + 1;
        var columnNumber = index - info.line.startLineIndex + 1;

        this.position = {line: lineNumber, column: columnNumber};

        message +=
            goog.string.subs(' at line %s column %s', lineNumber, columnNumber);
        // Append the offending line with a caret under the failing column.
        message += '\n' +
            goog.labs.format.csv.ParseError.getLineDebugString_(
                info.line.getContent(), columnNumber);
      }
    }

    goog.labs.format.csv.ParseError.base(this, 'constructor', message);
  };
  goog.inherits(goog.labs.format.csv.ParseError, goog.debug.Error);


  /** @override */
  goog.labs.format.csv.ParseError.prototype.name = 'ParseError';
  /**
   * Calculate the line and column for an index in a string.
   * TODO(nnaze): Consider moving to goog.string.newlines.
   * @param {string} str A string.
   * @param {number} index An index into the string.
   * @return {?{line: !goog.string.newlines.Line, lineIndex: number}} The line
   *     containing the index and the 0-based position of that line among the
   *     lines of the string, or null if no line contains the index.
   * @private
   */
  goog.labs.format.csv.ParseError.findLineInfo_ = function(str, index) {
    var lines = goog.string.newlines.getLines(str);
    var lineIndex = goog.array.findIndex(lines, function(line) {
      return line.startLineIndex <= index && line.endLineIndex > index;
    });

    // goog.array.findIndex signals "no match" with -1, which is still a
    // number, so the result has to be range-checked rather than type-checked.
    if (lineIndex < 0) {
      return null;
    }

    return {line: lines[lineIndex], lineIndex: lineIndex};
  };
  /**
   * Builds a two-line debug string: the given line, followed by a second line
   * holding a caret that points at the requested column.
   * @param {string} str The line of text to show.
   * @param {number} column The column the caret should sit under (1-indexed).
   * @return {string} The line and the caret line, separated by a newline.
   * @private
   */
  goog.labs.format.csv.ParseError.getLineDebugString_ = function(str, column) {
    var firstLine = str + '\n';
    // column is 1-indexed, so the caret is preceded by column - 1 spaces.
    var caretLine = goog.string.repeat(' ', column - 1) + '^';
    return firstLine + caretLine;
  };
  /**
   * A token -- a single-character string or a sentinel.
   *
   * Ordinary input characters are represented by one-character strings;
   * structural events (end of file, end of record, newline, empty trailing
   * field) are represented by the values of goog.labs.format.csv.Sentinels_.
   * @typedef {string|!goog.labs.format.csv.Sentinels_}
   */
  goog.labs.format.csv.Token;
  /**
   * Parses a CSV string to create a two-dimensional array.
   *
   * This function does not process header lines, etc -- such transformations
   * can be made on the resulting array.
   *
   * All comparisons against sentinels use strict equality: a loose comparison
   * between a string and a sentinel object coerces the object to the string
   * '[object Object]', which would make a field with exactly that content look
   * like a sentinel.
   *
   * @param {string} text The entire CSV text to be parsed.
   * @param {boolean=} opt_ignoreErrors Whether to ignore parsing errors and
   *     instead try to recover and keep going.
   * @param {string=} opt_delimiter The delimiter to use. Defaults to ','
   * @return {!Array<!Array<string>>} The parsed CSV.
   * @throws {goog.labs.format.csv.ParseError} If the text is malformed and
   *     opt_ignoreErrors is not set.
   */
  goog.labs.format.csv.parse = function(text, opt_ignoreErrors, opt_delimiter) {
    var index = 0;  // current char offset being considered

    var delimiter = opt_delimiter || ',';
    goog.asserts.assert(
        delimiter.length == 1, 'Delimiter must be a single character.');
    goog.asserts.assert(
        delimiter != '\r' && delimiter != '\n',
        'Cannot use newline or carriage return as delimiter.');

    var EOF = goog.labs.format.csv.Sentinels_.EOF;
    var EOR = goog.labs.format.csv.Sentinels_.EOR;
    var NEWLINE = goog.labs.format.csv.Sentinels_.NEWLINE;  // \r?\n
    var EMPTY = goog.labs.format.csv.Sentinels_.EMPTY;

    var pushBackToken = null;  // A single-token pushback.
    var sawComma = false;      // Special case for terminal comma.

    /**
     * Push a single token into the push-back variable.
     * @param {goog.labs.format.csv.Token} t Single token.
     */
    function pushBack(t) {
      goog.labs.format.csv.assertToken_(t);
      goog.asserts.assert(pushBackToken === null);
      pushBackToken = t;
    }

    /**
     * @return {goog.labs.format.csv.Token} The next token in the stream.
     */
    function nextToken() {
      // Give the push back token if present.
      if (pushBackToken !== null) {
        var pushedBack = pushBackToken;
        pushBackToken = null;
        return pushedBack;
      }

      // We're done. EOF.
      if (index >= text.length) {
        return EOF;
      }

      // Give the next character.
      var chr = text.charAt(index++);
      goog.labs.format.csv.assertToken_(chr);

      // Newlines are reported through the NEWLINE sentinel.
      if (chr === '\n') {
        return NEWLINE;
      }
      if (chr === '\r') {
        // A '\r\n' pair is a single newline token: also step over the '\n'.
        if (index < text.length && text.charAt(index) === '\n') {
          index++;
        }
        return NEWLINE;
      }

      return chr;
    }

    /**
     * Read a quoted field from input.
     * @return {string} The field, as a string.
     */
    function readQuotedField() {
      // We've already consumed the first quote by the time we get here.
      var start = index;

      // Offset of the closing quote, or null while no candidate is known.
      var end = null;

      for (var token = nextToken(); token !== EOF; token = nextToken()) {
        if (token === '"') {
          end = index - 1;
          token = nextToken();

          // Two double quotes in a row. Keep scanning.
          if (token === '"') {
            end = null;
            continue;
          }

          // End of field. Break out.
          if (token === delimiter || token === EOF || token === NEWLINE) {
            if (token === delimiter) {
              // Remember the delimiter so that a record ending right after it
              // still gets its trailing empty field, exactly as it does after
              // an unquoted field.
              sawComma = true;
            } else if (token === NEWLINE) {
              pushBack(token);
            }
            break;
          }

          if (!opt_ignoreErrors) {
            // Ignoring errors here means keep going in current field after
            // closing quote. E.g. "ab"c,d splits into abc,d
            throw new goog.labs.format.csv.ParseError(
                text, index - 1,
                'Unexpected character "' + token + '" after quote mark');
          } else {
            // Fall back to reading the rest of this field as unquoted.
            // Note: the rest is guaranteed not start with ", as that case is
            // eliminated above.
            var prefix = '"' + text.substring(start, index);
            var suffix = readField();
            if (suffix === EOR) {
              pushBack(NEWLINE);
              return prefix;
            } else {
              return prefix + suffix;
            }
          }
        }
      }

      if (end === null) {
        if (!opt_ignoreErrors) {
          throw new goog.labs.format.csv.ParseError(
              text, text.length - 1, 'Unexpected end of text after open quote');
        } else {
          // Unterminated quote: take everything up to the end of the text.
          end = text.length;
        }
      }

      // Take substring, combine double quotes.
      return text.substring(start, end).replace(/""/g, '"');
    }

    /**
     * Read a field from input.
     * @return {string|!goog.labs.format.csv.Sentinels_} The field, as a string,
     *     or a sentinel (if applicable).
     */
    function readField() {
      var start = index;
      var didSeeComma = sawComma;
      sawComma = false;
      var token = nextToken();

      // EMPTY was pushed back after a trailing empty field; the record is over.
      if (token === EMPTY) {
        return EOR;
      }

      if (token === EOF || token === NEWLINE) {
        if (didSeeComma) {
          // The previous field ended with a delimiter, so there is one more
          // (empty) field before the record ends.
          pushBack(EMPTY);
          return '';
        }
        return EOR;
      }

      // This is the beginning of a quoted field.
      if (token === '"') {
        return readQuotedField();
      }

      while (true) {
        // This is the end of line or file.
        if (token === EOF || token === NEWLINE) {
          pushBack(token);
          break;
        }

        // This is the end of the field.
        if (token === delimiter) {
          sawComma = true;
          break;
        }

        if (token === '"' && !opt_ignoreErrors) {
          throw new goog.labs.format.csv.ParseError(
              text, index - 1, 'Unexpected quote mark');
        }

        token = nextToken();
      }

      var returnString = (token === EOF) ?
          text.substring(start) :  // Return to end of file.
          text.substring(start, index - 1);

      return returnString.replace(/[\r\n]+/g, '');  // Squash any CRLFs.
    }

    /**
     * Read the next record.
     * @return {!Array<string>|!goog.labs.format.csv.Sentinels_} A single record
     *     with multiple fields, or the EOF sentinel once the text is used up.
     */
    function readRecord() {
      if (index >= text.length) {
        return EOF;
      }
      var record = [];
      for (var field = readField(); field !== EOR; field = readField()) {
        record.push(field);
      }
      return record;
    }

    // Read all records and return.
    var records = [];
    for (var record = readRecord(); record !== EOF; record = readRecord()) {
      records.push(record);
    }
    return records;
  };
  /**
   * Sentinel tracking objects. Each value is a distinct empty object that the
   * parser uses as a marker in place of a character or a field.
   * @enum {!Object}
   * @private
   */
  goog.labs.format.csv.Sentinels_ = {
    /**
     * Empty field. Pushed back after a trailing empty field has been returned,
     * so that the following field read reports the end of the record.
     */
    EMPTY: {},
    /** End of file. Returned once the whole text has been consumed. */
    EOF: {},
    /** End of record. Returned in place of a field when a record is over. */
    EOR: {},
    /**
     * Newline. \r?\n
     * The tokenizer in goog.labs.format.csv.parse also yields this for a '\r'
     * that is not followed by '\n'.
     */
    NEWLINE: {}
  };
  /**
   * Checks whether a value is a string consisting of exactly one character
   * (one UTF-16 code unit).
   * @param {string} str A string.
   * @return {boolean} Whether the string is a single character.
   * @private
   */
  goog.labs.format.csv.isCharacterString_ = function(str) {
    // Native typeof check instead of the deprecated goog.isString helper.
    return typeof str === 'string' && str.length == 1;
  };
  /**
   * Assert the parameter is a token: either a string of length 1 or one of the
   * values of goog.labs.format.csv.Sentinels_.
   * @param {*} o What should be a token.
   * @throws {goog.asserts.AssertionError} If {@code o} is not a token.
   * @private
   */
  goog.labs.format.csv.assertToken_ = function(o) {
    if (typeof o === 'string') {
      // Strings must be exactly one character long.
      goog.asserts.assert(
          goog.labs.format.csv.isCharacterString_(o),
          'Should be a string of length 1 or a sentinel.');
    } else {
      // Anything else must be one of the sentinel objects.
      goog.asserts.assert(
          goog.object.containsValue(goog.labs.format.csv.Sentinels_, o),
          'Should be a string of length 1 or a sentinel.');
    }
  };