// wrapper for non-node envs
| 2 | ;(function (sax) { |
| 3 | |
| 4 | sax.parser = function (strict, opt) { return new SAXParser(strict, opt) } |
| 5 | sax.SAXParser = SAXParser |
| 6 | sax.SAXStream = SAXStream |
| 7 | sax.createStream = createStream |
| 8 | |
| 9 | // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. |
| 10 | // When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), |
| 11 | // since that's the earliest that a buffer overrun could occur. This way, checks are |
| 12 | // as rare as required, but as often as necessary to ensure never crossing this bound. |
| 13 | // Furthermore, buffers are only tested at most once per write(), so passing a very |
| 14 | // large string into write() might have undesirable effects, but this is manageable by |
| 15 | // the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme |
| 16 | // edge case, result in creating at most one complete copy of the string passed in. |
| 17 | // Set to Infinity to have unlimited buffers. |
| 18 | sax.MAX_BUFFER_LENGTH = 64 * 1024 |
| 19 | |
| 20 | var buffers = [ |
| 21 | "comment", "sgmlDecl", "textNode", "tagName", "doctype", |
| 22 | "procInstName", "procInstBody", "entity", "attribName", |
| 23 | "attribValue", "cdata", "script" |
| 24 | ] |
| 25 | |
| 26 | sax.EVENTS = // for discoverability. |
| 27 | [ "text" |
| 28 | , "processinginstruction" |
| 29 | , "sgmldeclaration" |
| 30 | , "doctype" |
| 31 | , "comment" |
| 32 | , "attribute" |
| 33 | , "opentag" |
| 34 | , "closetag" |
| 35 | , "opencdata" |
| 36 | , "cdata" |
| 37 | , "closecdata" |
| 38 | , "error" |
| 39 | , "end" |
| 40 | , "ready" |
| 41 | , "script" |
| 42 | , "opennamespace" |
| 43 | , "closenamespace" |
| 44 | ] |
| 45 | |
| 46 | function SAXParser (strict, opt) { |
| 47 | if (!(this instanceof SAXParser)) return new SAXParser(strict, opt) |
| 48 | |
| 49 | var parser = this |
| 50 | clearBuffers(parser) |
| 51 | parser.q = parser.c = "" |
| 52 | parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH |
| 53 | parser.opt = opt || {} |
| 54 | parser.tagCase = parser.opt.lowercasetags ? "toLowerCase" : "toUpperCase" |
| 55 | parser.tags = [] |
| 56 | parser.closed = parser.closedRoot = parser.sawRoot = false |
| 57 | parser.tag = parser.error = null |
| 58 | parser.strict = !!strict |
| 59 | parser.noscript = !!(strict || parser.opt.noscript) |
| 60 | parser.state = S.BEGIN |
| 61 | parser.ENTITIES = Object.create(sax.ENTITIES) |
| 62 | parser.attribList = [] |
| 63 | |
| 64 | // namespaces form a prototype chain. |
| 65 | // it always points at the current tag, |
| 66 | // which protos to its parent tag. |
| 67 | if (parser.opt.xmlns) parser.ns = Object.create(rootNS) |
| 68 | |
| 69 | // mostly just for error reporting |
| 70 | parser.position = parser.line = parser.column = 0 |
| 71 | emit(parser, "onready") |
| 72 | } |
| 73 | |
| 74 | if (!Object.create) Object.create = function (o) { |
| 75 | function f () { this.__proto__ = o } |
| 76 | f.prototype = o |
| 77 | return new f |
| 78 | } |
| 79 | |
| 80 | if (!Object.getPrototypeOf) Object.getPrototypeOf = function (o) { |
| 81 | return o.__proto__ |
| 82 | } |
| 83 | |
| 84 | if (!Object.keys) Object.keys = function (o) { |
| 85 | var a = [] |
| 86 | for (var i in o) if (o.hasOwnProperty(i)) a.push(i) |
| 87 | return a |
| 88 | } |
| 89 | |
| 90 | function checkBufferLength (parser) { |
| 91 | var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) |
| 92 | , maxActual = 0 |
| 93 | for (var i = 0, l = buffers.length; i < l; i ++) { |
| 94 | var len = parser[buffers[i]].length |
| 95 | if (len > maxAllowed) { |
| 96 | // Text/cdata nodes can get big, and since they're buffered, |
| 97 | // we can get here under normal conditions. |
| 98 | // Avoid issues by emitting the text node now, |
| 99 | // so at least it won't get any bigger. |
| 100 | switch (buffers[i]) { |
| 101 | case "textNode": |
| 102 | closeText(parser) |
| 103 | break |
| 104 | |
| 105 | case "cdata": |
| 106 | emitNode(parser, "oncdata", parser.cdata) |
| 107 | parser.cdata = "" |
| 108 | break |
| 109 | |
| 110 | case "script": |
| 111 | emitNode(parser, "onscript", parser.script) |
| 112 | parser.script = "" |
| 113 | break |
| 114 | |
| 115 | default: |
| 116 | error(parser, "Max buffer length exceeded: "+buffers[i]) |
| 117 | } |
| 118 | } |
| 119 | maxActual = Math.max(maxActual, len) |
| 120 | } |
| 121 | // schedule the next check for the earliest possible buffer overrun. |
| 122 | parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) |
| 123 | + parser.position |
| 124 | } |
| 125 | |
| 126 | function clearBuffers (parser) { |
| 127 | for (var i = 0, l = buffers.length; i < l; i ++) { |
| 128 | parser[buffers[i]] = "" |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | SAXParser.prototype = |
| 133 | { end: function () { end(this) } |
| 134 | , write: write |
| 135 | , resume: function () { this.error = null; return this } |
| 136 | , close: function () { return this.write(null) } |
| 137 | } |
| 138 | |
| 139 | try { |
| 140 | var Stream = require("stream").Stream |
| 141 | } catch (ex) { |
| 142 | var Stream = function () {} |
| 143 | } |
| 144 | |
| 145 | |
| 146 | var streamWraps = sax.EVENTS.filter(function (ev) { |
| 147 | return ev !== "error" && ev !== "end" |
| 148 | }) |
| 149 | |
| 150 | function createStream (strict, opt) { |
| 151 | return new SAXStream(strict, opt) |
| 152 | } |
| 153 | |
| 154 | function SAXStream (strict, opt) { |
| 155 | if (!(this instanceof SAXStream)) return new SAXStream(strict, opt) |
| 156 | |
| 157 | Stream.apply(me) |
| 158 | |
| 159 | this._parser = new SAXParser(strict, opt) |
| 160 | this.writable = true |
| 161 | this.readable = true |
| 162 | |
| 163 | |
| 164 | var me = this |
| 165 | |
| 166 | this._parser.onend = function () { |
| 167 | me.emit("end") |
| 168 | } |
| 169 | |
| 170 | this._parser.onerror = function (er) { |
| 171 | me.emit("error", er) |
| 172 | |
| 173 | // if didn't throw, then means error was handled. |
| 174 | // go ahead and clear error, so we can write again. |
| 175 | me._parser.error = null |
| 176 | } |
| 177 | |
| 178 | streamWraps.forEach(function (ev) { |
| 179 | Object.defineProperty(me, "on" + ev, { |
| 180 | get: function () { return me._parser["on" + ev] }, |
| 181 | set: function (h) { |
| 182 | if (!h) { |
| 183 | me.removeAllListeners(ev) |
| 184 | return me._parser["on"+ev] = h |
| 185 | } |
| 186 | me.on(ev, h) |
| 187 | }, |
| 188 | enumerable: true, |
| 189 | configurable: false |
| 190 | }) |
| 191 | }) |
| 192 | } |
| 193 | |
| 194 | SAXStream.prototype = Object.create(Stream.prototype, |
| 195 | { constructor: { value: SAXStream } }) |
| 196 | |
| 197 | SAXStream.prototype.write = function (data) { |
| 198 | this._parser.write(data.toString()) |
| 199 | this.emit("data", data) |
| 200 | return true |
| 201 | } |
| 202 | |
| 203 | SAXStream.prototype.end = function (chunk) { |
| 204 | if (chunk && chunk.length) this._parser.write(chunk.toString()) |
| 205 | this._parser.end() |
| 206 | return true |
| 207 | } |
| 208 | |
| 209 | SAXStream.prototype.on = function (ev, handler) { |
| 210 | var me = this |
| 211 | if (!me._parser["on"+ev] && streamWraps.indexOf(ev) !== -1) { |
| 212 | me._parser["on"+ev] = function () { |
| 213 | var args = arguments.length === 1 ? [arguments[0]] |
| 214 | : Array.apply(null, arguments) |
| 215 | args.splice(0, 0, ev) |
| 216 | me.emit.apply(me, args) |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | return Stream.prototype.on.call(me, ev, handler) |
| 221 | } |
| 222 | |
| 223 | |
| 224 | |
| 225 | // character classes and tokens |
| 226 | var whitespace = "\r\n\t " |
| 227 | // this really needs to be replaced with character classes. |
| 228 | // XML allows all manner of ridiculous numbers and digits. |
| 229 | , number = "0124356789" |
| 230 | , letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 231 | // (Letter | "_" | ":") |
| 232 | , nameStart = letter+"_:" |
| 233 | , nameBody = nameStart+number+"-." |
| 234 | , quote = "'\"" |
| 235 | , entity = number+letter+"#" |
| 236 | , attribEnd = whitespace + ">" |
| 237 | , CDATA = "[CDATA[" |
| 238 | , DOCTYPE = "DOCTYPE" |
| 239 | , XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" |
| 240 | , XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/" |
| 241 | , rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE } |
| 242 | |
| 243 | // turn all the string character sets into character class objects. |
| 244 | whitespace = charClass(whitespace) |
| 245 | number = charClass(number) |
| 246 | letter = charClass(letter) |
| 247 | nameStart = charClass(nameStart) |
| 248 | nameBody = charClass(nameBody) |
| 249 | quote = charClass(quote) |
| 250 | entity = charClass(entity) |
| 251 | attribEnd = charClass(attribEnd) |
| 252 | |
| 253 | function charClass (str) { |
| 254 | return str.split("").reduce(function (s, c) { |
| 255 | s[c] = true |
| 256 | return s |
| 257 | }, {}) |
| 258 | } |
| 259 | |
| 260 | function is (charclass, c) { |
| 261 | return charclass[c] |
| 262 | } |
| 263 | |
| 264 | function not (charclass, c) { |
| 265 | return !charclass[c] |
| 266 | } |
| 267 | |
| 268 | var S = 0 |
| 269 | sax.STATE = |
| 270 | { BEGIN : S++ |
| 271 | , TEXT : S++ // general stuff |
| 272 | , TEXT_ENTITY : S++ // & and such. |
| 273 | , OPEN_WAKA : S++ // < |
| 274 | , SGML_DECL : S++ // <!BLARG |
| 275 | , SGML_DECL_QUOTED : S++ // <!BLARG foo "bar |
| 276 | , DOCTYPE : S++ // <!DOCTYPE |
| 277 | , DOCTYPE_QUOTED : S++ // <!DOCTYPE "//blah |
| 278 | , DOCTYPE_DTD : S++ // <!DOCTYPE "//blah" [ ... |
| 279 | , DOCTYPE_DTD_QUOTED : S++ // <!DOCTYPE "//blah" [ "foo |
| 280 | , COMMENT_STARTING : S++ // <!- |
| 281 | , COMMENT : S++ // <!-- |
| 282 | , COMMENT_ENDING : S++ // <!-- blah - |
| 283 | , COMMENT_ENDED : S++ // <!-- blah -- |
| 284 | , CDATA : S++ // <![CDATA[ something |
| 285 | , CDATA_ENDING : S++ // ] |
| 286 | , CDATA_ENDING_2 : S++ // ]] |
| 287 | , PROC_INST : S++ // <?hi |
| 288 | , PROC_INST_BODY : S++ // <?hi there |
| 289 | , PROC_INST_QUOTED : S++ // <?hi "there |
| 290 | , PROC_INST_ENDING : S++ // <?hi "there" ? |
| 291 | , OPEN_TAG : S++ // <strong |
| 292 | , OPEN_TAG_SLASH : S++ // <strong / |
| 293 | , ATTRIB : S++ // <a |
| 294 | , ATTRIB_NAME : S++ // <a foo |
| 295 | , ATTRIB_NAME_SAW_WHITE : S++ // <a foo _ |
| 296 | , ATTRIB_VALUE : S++ // <a foo= |
| 297 | , ATTRIB_VALUE_QUOTED : S++ // <a foo="bar |
| 298 | , ATTRIB_VALUE_UNQUOTED : S++ // <a foo=bar |
| 299 | , ATTRIB_VALUE_ENTITY_Q : S++ // <foo bar=""" |
| 300 | , ATTRIB_VALUE_ENTITY_U : S++ // <foo bar=" |
| 301 | , CLOSE_TAG : S++ // </a |
| 302 | , CLOSE_TAG_SAW_WHITE : S++ // </a > |
| 303 | , SCRIPT : S++ // <script> ... |
| 304 | , SCRIPT_ENDING : S++ // <script> ... < |
| 305 | } |
| 306 | |
| 307 | sax.ENTITIES = |
| 308 | { "apos" : "'" |
| 309 | , "quot" : "\"" |
| 310 | , "amp" : "&" |
| 311 | , "gt" : ">" |
| 312 | , "lt" : "<" |
| 313 | } |
| 314 | |
| 315 | for (var S in sax.STATE) sax.STATE[sax.STATE[S]] = S |
| 316 | |
| 317 | // shorthand |
| 318 | S = sax.STATE |
| 319 | |
| 320 | function emit (parser, event, data) { |
| 321 | parser[event] && parser[event](data) |
| 322 | } |
| 323 | |
| 324 | function emitNode (parser, nodeType, data) { |
| 325 | if (parser.textNode) closeText(parser) |
| 326 | emit(parser, nodeType, data) |
| 327 | } |
| 328 | |
| 329 | function closeText (parser) { |
| 330 | parser.textNode = textopts(parser.opt, parser.textNode) |
| 331 | if (parser.textNode) emit(parser, "ontext", parser.textNode) |
| 332 | parser.textNode = "" |
| 333 | } |
| 334 | |
| 335 | function textopts (opt, text) { |
| 336 | if (opt.trim) text = text.trim() |
| 337 | if (opt.normalize) text = text.replace(/\s+/g, " ") |
| 338 | return text |
| 339 | } |
| 340 | |
| 341 | function error (parser, er) { |
| 342 | closeText(parser) |
| 343 | er += "\nLine: "+parser.line+ |
| 344 | "\nColumn: "+parser.column+ |
| 345 | "\nChar: "+parser.c |
| 346 | er = new Error(er) |
| 347 | parser.error = er |
| 348 | emit(parser, "onerror", er) |
| 349 | return parser |
| 350 | } |
| 351 | |
| 352 | function end (parser) { |
| 353 | if (parser.state !== S.TEXT) error(parser, "Unexpected end") |
| 354 | closeText(parser) |
| 355 | parser.c = "" |
| 356 | parser.closed = true |
| 357 | emit(parser, "onend") |
| 358 | SAXParser.call(parser, parser.strict, parser.opt) |
| 359 | return parser |
| 360 | } |
| 361 | |
| 362 | function strictFail (parser, message) { |
| 363 | if (parser.strict) error(parser, message) |
| 364 | } |
| 365 | |
| 366 | function newTag (parser) { |
| 367 | if (!parser.strict) parser.tagName = parser.tagName[parser.tagCase]() |
| 368 | var parent = parser.tags[parser.tags.length - 1] || parser |
| 369 | , tag = parser.tag = { name : parser.tagName, attributes : {} } |
| 370 | |
| 371 | // will be overridden if tag contails an xmlns="foo" or xmlns:foo="bar" |
| 372 | if (parser.opt.xmlns) tag.ns = parent.ns |
| 373 | parser.attribList.length = 0 |
| 374 | } |
| 375 | |
| 376 | function qname (name) { |
| 377 | var i = name.indexOf(":") |
| 378 | , qualName = i < 0 ? [ "", name ] : name.split(":") |
| 379 | , prefix = qualName[0] |
| 380 | , local = qualName[1] |
| 381 | |
| 382 | // <x "xmlns"="http://foo"> |
| 383 | if (name === "xmlns") { |
| 384 | prefix = "xmlns" |
| 385 | local = "" |
| 386 | } |
| 387 | |
| 388 | return { prefix: prefix, local: local } |
| 389 | } |
| 390 | |
| 391 | function attrib (parser) { |
| 392 | if (parser.opt.xmlns) { |
| 393 | var qn = qname(parser.attribName) |
| 394 | , prefix = qn.prefix |
| 395 | , local = qn.local |
| 396 | |
| 397 | if (prefix === "xmlns") { |
| 398 | // namespace binding attribute; push the binding into scope |
| 399 | if (local === "xml" && parser.attribValue !== XML_NAMESPACE) { |
| 400 | strictFail( parser |
| 401 | , "xml: prefix must be bound to " + XML_NAMESPACE + "\n" |
| 402 | + "Actual: " + parser.attribValue ) |
| 403 | } else if (local === "xmlns" && parser.attribValue !== XMLNS_NAMESPACE) { |
| 404 | strictFail( parser |
| 405 | , "xmlns: prefix must be bound to " + XMLNS_NAMESPACE + "\n" |
| 406 | + "Actual: " + parser.attribValue ) |
| 407 | } else { |
| 408 | var tag = parser.tag |
| 409 | , parent = parser.tags[parser.tags.length - 1] || parser |
| 410 | if (tag.ns === parent.ns) { |
| 411 | tag.ns = Object.create(parent.ns) |
| 412 | } |
| 413 | tag.ns[local] = parser.attribValue |
| 414 | } |
| 415 | } |
| 416 | |
| 417 | // defer onattribute events until all attributes have been seen |
| 418 | // so any new bindings can take effect; preserve attribute order |
| 419 | // so deferred events can be emitted in document order |
| 420 | parser.attribList.push([parser.attribName, parser.attribValue]) |
| 421 | } else { |
| 422 | // in non-xmlns mode, we can emit the event right away |
| 423 | parser.tag.attributes[parser.attribName] = parser.attribValue |
| 424 | emitNode( parser |
| 425 | , "onattribute" |
| 426 | , { name: parser.attribName |
| 427 | , value: parser.attribValue } ) |
| 428 | } |
| 429 | |
| 430 | parser.attribName = parser.attribValue = "" |
| 431 | } |
| 432 | |
| 433 | function openTag (parser, selfClosing) { |
| 434 | if (parser.opt.xmlns) { |
| 435 | // emit namespace binding events |
| 436 | var tag = parser.tag |
| 437 | |
| 438 | // add namespace info to tag |
| 439 | var qn = qname(parser.tagName) |
| 440 | tag.prefix = qn.prefix |
| 441 | tag.local = qn.local |
| 442 | tag.uri = tag.ns[qn.prefix] || qn.prefix |
| 443 | |
| 444 | if (tag.prefix && !tag.uri) { |
| 445 | strictFail(parser, "Unbound namespace prefix: " |
| 446 | + JSON.stringify(parser.tagName)) |
| 447 | } |
| 448 | |
| 449 | var parent = parser.tags[parser.tags.length - 1] || parser |
| 450 | if (tag.ns && parent.ns !== tag.ns) { |
| 451 | Object.keys(tag.ns).forEach(function (p) { |
| 452 | emitNode( parser |
| 453 | , "onopennamespace" |
| 454 | , { prefix: p , uri: tag.ns[p] } ) |
| 455 | }) |
| 456 | } |
| 457 | |
| 458 | // handle deferred onattribute events |
| 459 | for (var i = 0, l = parser.attribList.length; i < l; i ++) { |
| 460 | var nv = parser.attribList[i] |
| 461 | var name = nv[0] |
| 462 | , value = nv[1] |
| 463 | , qualName = qname(name) |
| 464 | , prefix = qualName.prefix |
| 465 | , local = qualName.local |
| 466 | , uri = tag.ns[prefix] || "" |
| 467 | , a = { name: name |
| 468 | , value: value |
| 469 | , prefix: prefix |
| 470 | , local: local |
| 471 | , uri: uri |
| 472 | } |
| 473 | |
| 474 | // if there's any attributes with an undefined namespace, |
| 475 | // then fail on them now. |
| 476 | if (prefix && prefix != "xmlns" && !uri) { |
| 477 | strictFail(parser, "Unbound namespace prefix: " |
| 478 | + JSON.stringify(prefix)) |
| 479 | a.uri = prefix |
| 480 | } |
| 481 | parser.tag.attributes[name] = a |
| 482 | emitNode(parser, "onattribute", a) |
| 483 | } |
| 484 | parser.attribList.length = 0 |
| 485 | } |
| 486 | |
| 487 | // process the tag |
| 488 | parser.sawRoot = true |
| 489 | parser.tags.push(parser.tag) |
| 490 | emitNode(parser, "onopentag", parser.tag) |
| 491 | if (!selfClosing) { |
| 492 | // special case for <script> in non-strict mode. |
| 493 | if (!parser.noscript && parser.tagName.toLowerCase() === "script") { |
| 494 | parser.state = S.SCRIPT |
| 495 | } else { |
| 496 | parser.state = S.TEXT |
| 497 | } |
| 498 | parser.tag = null |
| 499 | parser.tagName = "" |
| 500 | } |
| 501 | parser.attribName = parser.attribValue = "" |
| 502 | parser.attribList.length = 0 |
| 503 | } |
| 504 | |
| 505 | function closeTag (parser) { |
| 506 | if (!parser.tagName) { |
| 507 | strictFail(parser, "Weird empty close tag.") |
| 508 | parser.textNode += "</>" |
| 509 | parser.state = S.TEXT |
| 510 | return |
| 511 | } |
| 512 | // first make sure that the closing tag actually exists. |
| 513 | // <a><b></c></b></a> will close everything, otherwise. |
| 514 | var t = parser.tags.length |
| 515 | var tagName = parser.tagName |
| 516 | if (!parser.strict) tagName = tagName[parser.tagCase]() |
| 517 | var closeTo = tagName |
| 518 | while (t --) { |
| 519 | var close = parser.tags[t] |
| 520 | if (close.name !== closeTo) { |
| 521 | // fail the first time in strict mode |
| 522 | strictFail(parser, "Unexpected close tag") |
| 523 | } else break |
| 524 | } |
| 525 | |
| 526 | // didn't find it. we already failed for strict, so just abort. |
| 527 | if (t < 0) { |
| 528 | strictFail(parser, "Unmatched closing tag: "+parser.tagName) |
| 529 | parser.textNode += "</" + parser.tagName + ">" |
| 530 | parser.state = S.TEXT |
| 531 | return |
| 532 | } |
| 533 | parser.tagName = tagName |
| 534 | var s = parser.tags.length |
| 535 | while (s --> t) { |
| 536 | var tag = parser.tag = parser.tags.pop() |
| 537 | parser.tagName = parser.tag.name |
| 538 | emitNode(parser, "onclosetag", parser.tagName) |
| 539 | |
| 540 | var x = {} |
| 541 | for (var i in tag.ns) x[i] = tag.ns[i] |
| 542 | |
| 543 | var parent = parser.tags[parser.tags.length - 1] || parser |
| 544 | if (parser.opt.xmlns && tag.ns !== parent.ns) { |
| 545 | // remove namespace bindings introduced by tag |
| 546 | Object.keys(tag.ns).forEach(function (p) { |
| 547 | var n = tag.ns[p] |
| 548 | emitNode(parser, "onclosenamespace", { prefix: p, uri: n }) |
| 549 | }) |
| 550 | } |
| 551 | } |
| 552 | if (t === 0) parser.closedRoot = true |
| 553 | parser.tagName = parser.attribValue = parser.attribName = "" |
| 554 | parser.attribList.length = 0 |
| 555 | parser.state = S.TEXT |
| 556 | } |
| 557 | |
| 558 | function parseEntity (parser) { |
| 559 | var entity = parser.entity.toLowerCase() |
| 560 | , num |
| 561 | , numStr = "" |
| 562 | if (parser.ENTITIES[entity]) return parser.ENTITIES[entity] |
| 563 | if (entity.charAt(0) === "#") { |
| 564 | if (entity.charAt(1) === "x") { |
| 565 | entity = entity.slice(2) |
| 566 | num = parseInt(entity, 16) |
| 567 | numStr = num.toString(16) |
| 568 | } else { |
| 569 | entity = entity.slice(1) |
| 570 | num = parseInt(entity, 10) |
| 571 | numStr = num.toString(10) |
| 572 | } |
| 573 | } |
| 574 | entity = entity.replace(/^0+/, "") |
| 575 | if (numStr.toLowerCase() !== entity) { |
| 576 | strictFail(parser, "Invalid character entity") |
| 577 | return "&"+parser.entity + ";" |
| 578 | } |
| 579 | return String.fromCharCode(num) |
| 580 | } |
| 581 | |
| 582 | function write (chunk) { |
| 583 | var parser = this |
| 584 | if (this.error) throw this.error |
| 585 | if (parser.closed) return error(parser, |
| 586 | "Cannot write after close. Assign an onready handler.") |
| 587 | if (chunk === null) return end(parser) |
| 588 | var i = 0, c = "" |
| 589 | while (parser.c = c = chunk.charAt(i++)) { |
| 590 | parser.position ++ |
| 591 | if (c === "\n") { |
| 592 | parser.line ++ |
| 593 | parser.column = 0 |
| 594 | } else parser.column ++ |
| 595 | switch (parser.state) { |
| 596 | |
| 597 | case S.BEGIN: |
| 598 | if (c === "<") parser.state = S.OPEN_WAKA |
| 599 | else if (not(whitespace,c)) { |
| 600 | // have to process this as a text node. |
| 601 | // weird, but happens. |
| 602 | strictFail(parser, "Non-whitespace before first tag.") |
| 603 | parser.textNode = c |
| 604 | parser.state = S.TEXT |
| 605 | } |
| 606 | continue |
| 607 | |
| 608 | case S.TEXT: |
| 609 | if (parser.sawRoot && !parser.closedRoot) { |
| 610 | var starti = i-1 |
| 611 | while (c && c!=="<" && c!=="&") { |
| 612 | c = chunk.charAt(i++) |
| 613 | if (c) { |
| 614 | parser.position ++ |
| 615 | if (c === "\n") { |
| 616 | parser.line ++ |
| 617 | parser.column = 0 |
| 618 | } else parser.column ++ |
| 619 | } |
| 620 | } |
| 621 | parser.textNode += chunk.substring(starti, i-1) |
| 622 | } |
| 623 | if (c === "<") parser.state = S.OPEN_WAKA |
| 624 | else { |
| 625 | if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot)) |
| 626 | strictFail("Text data outside of root node.") |
| 627 | if (c === "&") parser.state = S.TEXT_ENTITY |
| 628 | else parser.textNode += c |
| 629 | } |
| 630 | continue |
| 631 | |
| 632 | case S.SCRIPT: |
| 633 | // only non-strict |
| 634 | if (c === "<") { |
| 635 | parser.state = S.SCRIPT_ENDING |
| 636 | } else parser.script += c |
| 637 | continue |
| 638 | |
| 639 | case S.SCRIPT_ENDING: |
| 640 | if (c === "/") { |
| 641 | emitNode(parser, "onscript", parser.script) |
| 642 | parser.state = S.CLOSE_TAG |
| 643 | parser.script = "" |
| 644 | parser.tagName = "" |
| 645 | } else { |
| 646 | parser.script += "<" + c |
| 647 | parser.state = S.SCRIPT |
| 648 | } |
| 649 | continue |
| 650 | |
| 651 | case S.OPEN_WAKA: |
| 652 | // either a /, ?, !, or text is coming next. |
| 653 | if (c === "!") { |
| 654 | parser.state = S.SGML_DECL |
| 655 | parser.sgmlDecl = "" |
| 656 | } else if (is(whitespace, c)) { |
| 657 | // wait for it... |
| 658 | } else if (is(nameStart,c)) { |
| 659 | parser.startTagPosition = parser.position - 1 |
| 660 | parser.state = S.OPEN_TAG |
| 661 | parser.tagName = c |
| 662 | } else if (c === "/") { |
| 663 | parser.startTagPosition = parser.position - 1 |
| 664 | parser.state = S.CLOSE_TAG |
| 665 | parser.tagName = "" |
| 666 | } else if (c === "?") { |
| 667 | parser.state = S.PROC_INST |
| 668 | parser.procInstName = parser.procInstBody = "" |
| 669 | } else { |
| 670 | strictFail(parser, "Unencoded <") |
| 671 | parser.textNode += "<" + c |
| 672 | parser.state = S.TEXT |
| 673 | } |
| 674 | continue |
| 675 | |
| 676 | case S.SGML_DECL: |
| 677 | if ((parser.sgmlDecl+c).toUpperCase() === CDATA) { |
| 678 | emitNode(parser, "onopencdata") |
| 679 | parser.state = S.CDATA |
| 680 | parser.sgmlDecl = "" |
| 681 | parser.cdata = "" |
| 682 | } else if (parser.sgmlDecl+c === "--") { |
| 683 | parser.state = S.COMMENT |
| 684 | parser.comment = "" |
| 685 | parser.sgmlDecl = "" |
| 686 | } else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) { |
| 687 | parser.state = S.DOCTYPE |
| 688 | if (parser.doctype || parser.sawRoot) strictFail(parser, |
| 689 | "Inappropriately located doctype declaration") |
| 690 | parser.doctype = "" |
| 691 | parser.sgmlDecl = "" |
| 692 | } else if (c === ">") { |
| 693 | emitNode(parser, "onsgmldeclaration", parser.sgmlDecl) |
| 694 | parser.sgmlDecl = "" |
| 695 | parser.state = S.TEXT |
| 696 | } else if (is(quote, c)) { |
| 697 | parser.state = S.SGML_DECL_QUOTED |
| 698 | parser.sgmlDecl += c |
| 699 | } else parser.sgmlDecl += c |
| 700 | continue |
| 701 | |
| 702 | case S.SGML_DECL_QUOTED: |
| 703 | if (c === parser.q) { |
| 704 | parser.state = S.SGML_DECL |
| 705 | parser.q = "" |
| 706 | } |
| 707 | parser.sgmlDecl += c |
| 708 | continue |
| 709 | |
| 710 | case S.DOCTYPE: |
| 711 | if (c === ">") { |
| 712 | parser.state = S.TEXT |
| 713 | emitNode(parser, "ondoctype", parser.doctype) |
| 714 | parser.doctype = true // just remember that we saw it. |
| 715 | } else { |
| 716 | parser.doctype += c |
| 717 | if (c === "[") parser.state = S.DOCTYPE_DTD |
| 718 | else if (is(quote, c)) { |
| 719 | parser.state = S.DOCTYPE_QUOTED |
| 720 | parser.q = c |
| 721 | } |
| 722 | } |
| 723 | continue |
| 724 | |
| 725 | case S.DOCTYPE_QUOTED: |
| 726 | parser.doctype += c |
| 727 | if (c === parser.q) { |
| 728 | parser.q = "" |
| 729 | parser.state = S.DOCTYPE |
| 730 | } |
| 731 | continue |
| 732 | |
| 733 | case S.DOCTYPE_DTD: |
| 734 | parser.doctype += c |
| 735 | if (c === "]") parser.state = S.DOCTYPE |
| 736 | else if (is(quote,c)) { |
| 737 | parser.state = S.DOCTYPE_DTD_QUOTED |
| 738 | parser.q = c |
| 739 | } |
| 740 | continue |
| 741 | |
| 742 | case S.DOCTYPE_DTD_QUOTED: |
| 743 | parser.doctype += c |
| 744 | if (c === parser.q) { |
| 745 | parser.state = S.DOCTYPE_DTD |
| 746 | parser.q = "" |
| 747 | } |
| 748 | continue |
| 749 | |
| 750 | case S.COMMENT: |
| 751 | if (c === "-") parser.state = S.COMMENT_ENDING |
| 752 | else parser.comment += c |
| 753 | continue |
| 754 | |
| 755 | case S.COMMENT_ENDING: |
| 756 | if (c === "-") { |
| 757 | parser.state = S.COMMENT_ENDED |
| 758 | parser.comment = textopts(parser.opt, parser.comment) |
| 759 | if (parser.comment) emitNode(parser, "oncomment", parser.comment) |
| 760 | parser.comment = "" |
| 761 | } else { |
| 762 | parser.comment += "-" + c |
| 763 | parser.state = S.COMMENT |
| 764 | } |
| 765 | continue |
| 766 | |
| 767 | case S.COMMENT_ENDED: |
| 768 | if (c !== ">") { |
| 769 | strictFail(parser, "Malformed comment") |
| 770 | // allow <!-- blah -- bloo --> in non-strict mode, |
| 771 | // which is a comment of " blah -- bloo " |
| 772 | parser.comment += "--" + c |
| 773 | parser.state = S.COMMENT |
| 774 | } else parser.state = S.TEXT |
| 775 | continue |
| 776 | |
| 777 | case S.CDATA: |
| 778 | if (c === "]") parser.state = S.CDATA_ENDING |
| 779 | else parser.cdata += c |
| 780 | continue |
| 781 | |
| 782 | case S.CDATA_ENDING: |
| 783 | if (c === "]") parser.state = S.CDATA_ENDING_2 |
| 784 | else { |
| 785 | parser.cdata += "]" + c |
| 786 | parser.state = S.CDATA |
| 787 | } |
| 788 | continue |
| 789 | |
| 790 | case S.CDATA_ENDING_2: |
| 791 | if (c === ">") { |
| 792 | if (parser.cdata) emitNode(parser, "oncdata", parser.cdata) |
| 793 | emitNode(parser, "onclosecdata") |
| 794 | parser.cdata = "" |
| 795 | parser.state = S.TEXT |
| 796 | } else if (c === "]") { |
| 797 | parser.cdata += "]" |
| 798 | } else { |
| 799 | parser.cdata += "]]" + c |
| 800 | parser.state = S.CDATA |
| 801 | } |
| 802 | continue |
| 803 | |
| 804 | case S.PROC_INST: |
| 805 | if (c === "?") parser.state = S.PROC_INST_ENDING |
| 806 | else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY |
| 807 | else parser.procInstName += c |
| 808 | continue |
| 809 | |
| 810 | case S.PROC_INST_BODY: |
| 811 | if (!parser.procInstBody && is(whitespace, c)) continue |
| 812 | else if (c === "?") parser.state = S.PROC_INST_ENDING |
| 813 | else if (is(quote, c)) { |
| 814 | parser.state = S.PROC_INST_QUOTED |
| 815 | parser.q = c |
| 816 | parser.procInstBody += c |
| 817 | } else parser.procInstBody += c |
| 818 | continue |
| 819 | |
| 820 | case S.PROC_INST_ENDING: |
| 821 | if (c === ">") { |
| 822 | emitNode(parser, "onprocessinginstruction", { |
| 823 | name : parser.procInstName, |
| 824 | body : parser.procInstBody |
| 825 | }) |
| 826 | parser.procInstName = parser.procInstBody = "" |
| 827 | parser.state = S.TEXT |
| 828 | } else { |
| 829 | parser.procInstBody += "?" + c |
| 830 | parser.state = S.PROC_INST_BODY |
| 831 | } |
| 832 | continue |
| 833 | |
| 834 | case S.PROC_INST_QUOTED: |
| 835 | parser.procInstBody += c |
| 836 | if (c === parser.q) { |
| 837 | parser.state = S.PROC_INST_BODY |
| 838 | parser.q = "" |
| 839 | } |
| 840 | continue |
| 841 | |
| 842 | case S.OPEN_TAG: |
| 843 | if (is(nameBody, c)) parser.tagName += c |
| 844 | else { |
| 845 | newTag(parser) |
| 846 | if (c === ">") openTag(parser) |
| 847 | else if (c === "/") parser.state = S.OPEN_TAG_SLASH |
| 848 | else { |
| 849 | if (not(whitespace, c)) strictFail( |
| 850 | parser, "Invalid character in tag name") |
| 851 | parser.state = S.ATTRIB |
| 852 | } |
| 853 | } |
| 854 | continue |
| 855 | |
| 856 | case S.OPEN_TAG_SLASH: |
| 857 | if (c === ">") { |
| 858 | openTag(parser, true) |
| 859 | closeTag(parser) |
| 860 | } else { |
| 861 | strictFail(parser, "Forward-slash in opening tag not followed by >") |
| 862 | parser.state = S.ATTRIB |
| 863 | } |
| 864 | continue |
| 865 | |
| 866 | case S.ATTRIB: |
| 867 | // haven't read the attribute name yet. |
| 868 | if (is(whitespace, c)) continue |
| 869 | else if (c === ">") openTag(parser) |
| 870 | else if (c === "/") parser.state = S.OPEN_TAG_SLASH |
| 871 | else if (is(nameStart, c)) { |
| 872 | parser.attribName = c |
| 873 | parser.attribValue = "" |
| 874 | parser.state = S.ATTRIB_NAME |
| 875 | } else strictFail(parser, "Invalid attribute name") |
| 876 | continue |
| 877 | |
| 878 | case S.ATTRIB_NAME: |
| 879 | if (c === "=") parser.state = S.ATTRIB_VALUE |
| 880 | else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE |
| 881 | else if (is(nameBody, c)) parser.attribName += c |
| 882 | else strictFail(parser, "Invalid attribute name") |
| 883 | continue |
| 884 | |
| 885 | case S.ATTRIB_NAME_SAW_WHITE: |
| 886 | if (c === "=") parser.state = S.ATTRIB_VALUE |
| 887 | else if (is(whitespace, c)) continue |
| 888 | else { |
| 889 | strictFail(parser, "Attribute without value") |
| 890 | parser.tag.attributes[parser.attribName] = "" |
| 891 | parser.attribValue = "" |
| 892 | emitNode(parser, "onattribute", |
| 893 | { name : parser.attribName, value : "" }) |
| 894 | parser.attribName = "" |
| 895 | if (c === ">") openTag(parser) |
| 896 | else if (is(nameStart, c)) { |
| 897 | parser.attribName = c |
| 898 | parser.state = S.ATTRIB_NAME |
| 899 | } else { |
| 900 | strictFail(parser, "Invalid attribute name") |
| 901 | parser.state = S.ATTRIB |
| 902 | } |
| 903 | } |
| 904 | continue |
| 905 | |
| 906 | case S.ATTRIB_VALUE: |
| 907 | if (is(whitespace, c)) continue |
| 908 | else if (is(quote, c)) { |
| 909 | parser.q = c |
| 910 | parser.state = S.ATTRIB_VALUE_QUOTED |
| 911 | } else { |
| 912 | strictFail(parser, "Unquoted attribute value") |
| 913 | parser.state = S.ATTRIB_VALUE_UNQUOTED |
| 914 | parser.attribValue = c |
| 915 | } |
| 916 | continue |
| 917 | |
| 918 | case S.ATTRIB_VALUE_QUOTED: |
| 919 | if (c !== parser.q) { |
| 920 | if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q |
| 921 | else parser.attribValue += c |
| 922 | continue |
| 923 | } |
| 924 | attrib(parser) |
| 925 | parser.q = "" |
| 926 | parser.state = S.ATTRIB |
| 927 | continue |
| 928 | |
| 929 | case S.ATTRIB_VALUE_UNQUOTED: |
| 930 | if (not(attribEnd,c)) { |
| 931 | if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U |
| 932 | else parser.attribValue += c |
| 933 | continue |
| 934 | } |
| 935 | attrib(parser) |
| 936 | if (c === ">") openTag(parser) |
| 937 | else parser.state = S.ATTRIB |
| 938 | continue |
| 939 | |
| 940 | case S.CLOSE_TAG: |
| 941 | if (!parser.tagName) { |
| 942 | if (is(whitespace, c)) continue |
| 943 | else if (not(nameStart, c)) strictFail(parser, |
| 944 | "Invalid tagname in closing tag.") |
| 945 | else parser.tagName = c |
| 946 | } |
| 947 | else if (c === ">") closeTag(parser) |
| 948 | else if (is(nameBody, c)) parser.tagName += c |
| 949 | else { |
| 950 | if (not(whitespace, c)) strictFail(parser, |
| 951 | "Invalid tagname in closing tag") |
| 952 | parser.state = S.CLOSE_TAG_SAW_WHITE |
| 953 | } |
| 954 | continue |
| 955 | |
| 956 | case S.CLOSE_TAG_SAW_WHITE: |
| 957 | if (is(whitespace, c)) continue |
| 958 | if (c === ">") closeTag(parser) |
| 959 | else strictFail("Invalid characters in closing tag") |
| 960 | continue |
| 961 | |
| 962 | case S.TEXT_ENTITY: |
| 963 | case S.ATTRIB_VALUE_ENTITY_Q: |
| 964 | case S.ATTRIB_VALUE_ENTITY_U: |
| 965 | switch(parser.state) { |
| 966 | case S.TEXT_ENTITY: |
| 967 | var returnState = S.TEXT, buffer = "textNode" |
| 968 | break |
| 969 | |
| 970 | case S.ATTRIB_VALUE_ENTITY_Q: |
| 971 | var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue" |
| 972 | break |
| 973 | |
| 974 | case S.ATTRIB_VALUE_ENTITY_U: |
| 975 | var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue" |
| 976 | break |
| 977 | } |
| 978 | if (c === ";") { |
| 979 | parser[buffer] += parseEntity(parser) |
| 980 | parser.entity = "" |
| 981 | parser.state = returnState |
| 982 | } |
| 983 | else if (is(entity, c)) parser.entity += c |
| 984 | else { |
| 985 | strictFail("Invalid character entity") |
| 986 | parser[buffer] += "&" + parser.entity + c |
| 987 | parser.entity = "" |
| 988 | parser.state = returnState |
| 989 | } |
| 990 | continue |
| 991 | |
| 992 | default: |
| 993 | throw new Error(parser, "Unknown state: " + parser.state) |
| 994 | } |
| 995 | } // while |
| 996 | // cdata blocks can get very big under normal conditions. emit and move on. |
| 997 | // if (parser.state === S.CDATA && parser.cdata) { |
| 998 | // emitNode(parser, "oncdata", parser.cdata) |
| 999 | // parser.cdata = "" |
| 1000 | // } |
| 1001 | if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser) |
| 1002 | return parser |
| 1003 | } |
| 1004 | |
| 1005 | })(typeof exports === "undefined" ? sax = {} : exports) |