|
| 1 | +# html-parser |
| 2 | + |
| 3 | +```js |
| 4 | +/** |
| 5 | + * Not type-checking this file because it's mostly vendor code. |
| 6 | + */ |
| 7 | + |
| 8 | +/*! |
| 9 | + * HTML Parser By John Resig (ejohn.org) |
| 10 | + * Modified by Juriy "kangax" Zaytsev |
| 11 | + * Original code by Erik Arvidsson (MPL-1.1 OR Apache-2.0 OR GPL-2.0-or-later) |
| 12 | + * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js |
| 13 | + */ |
| 14 | + |
| 15 | +import { makeMap, no } from 'shared/util' |
| 16 | +import { isNonPhrasingTag } from 'web/compiler/util' |
| 17 | +import { unicodeRegExp } from 'core/util/lang' |
| 18 | + |
// Patterns used to recognise tags and attributes.
// `attribute` capture groups: 1 = name, 2 = "=", 3 = double-quoted value,
// 4 = single-quoted value, 5 = unquoted value.
const attribute = /^\s*([^\s"'<>\/=]+)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
// Same capture layout as `attribute`, but the name must begin with `v-xxx:`,
// `@`, `:` or `#` followed by a bracketed dynamic argument.
const dynamicArgAttribute = /^\s*((?:v-[\w-]+:|@|:|#)\[[^=]+\][^\s"'<>\/=]*)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
// Name without a colon; the extra permitted characters come from `unicodeRegExp`.
const ncname = `[a-zA-Z_][\\-\\.0-9_a-zA-Z${unicodeRegExp.source}]*`
// Optionally prefixed name (`prefix:name`), captured as a single group.
const qnameCapture = `((?:${ncname}\\:)?${ncname})`
const startTagOpen = new RegExp(`^<${qnameCapture}`)
// Group 1 is "/" when the tag is closed with "/>", otherwise the empty string.
const startTagClose = /^\s*(\/?)>/
const endTag = new RegExp(`^<\\/${qnameCapture}[^>]*>`)
const doctype = /^<!DOCTYPE [^>]+>/i
// #7298: escape - to avoid being passed as HTML comment when inlined in page
const comment = /^<!\--/
const conditionalComment = /^<!\[/

// Special Elements (can contain anything)
export const isPlainTextElement = makeMap('script,style,textarea', true)
// Cache of closing-tag patterns, keyed by lower-cased tag name (filled by parseHTML).
const reCache = {}
| 35 | + |
// Maps each encoded entity matched by `encodedAttr` / `encodedAttrWithNewLines`
// to the literal character it stands for. The keys have to be the encoded
// entities themselves, because `decodeAttr` looks up the raw regex match here.
const decodingMap = {
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&amp;': '&',
  '&#10;': '\n',
  '&#9;': '\t',
  '&#39;': "'"
}
// Entities that are decoded in every attribute value.
const encodedAttr = /&(?:lt|gt|quot|amp|#39);/g
// Same set plus the encoded line feed (&#10;) and tab (&#9;).
const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#39|#10|#9);/g

// #5992
const isIgnoreNewlineTag = makeMap('pre,textarea', true)
// Truthy when `tag` is one of the tags above and `html` begins with a line feed.
const shouldIgnoreFirstNewline = (tag, html) => tag && isIgnoreNewlineTag(tag) && html[0] === '\n'
| 51 | + |
/**
 * Replace encoded entities in an attribute value with their literal characters.
 *
 * @param {string} value - raw attribute value
 * @param {*} shouldDecodeNewlines - when truthy, the pattern that also covers
 *   the encoded line feed and tab is used
 * @returns {string} the value with every matched entity looked up in `decodingMap`
 */
function decodeAttr (value, shouldDecodeNewlines) {
  let pattern = encodedAttr
  if (shouldDecodeNewlines) {
    pattern = encodedAttrWithNewLines
  }
  return value.replace(pattern, entity => decodingMap[entity])
}
| 56 | + |
/**
 * Walk through an HTML/template string from left to right and report what is
 * found through the callbacks supplied on `options`.
 *
 * Options read by this function:
 * - expectHTML: enables implicit closing of <p> and of left-open tags
 * - isUnaryTag(tag), canBeLeftOpenTag(tag): tag predicates (default: `no`)
 * - shouldKeepComment, comment(text, start, end)
 * - start(tag, attrs, unary, start, end)
 * - end(tag, start, end)
 * - chars(text[, start, end]) - offsets are omitted for plain-text element
 *   content and for the unparsable remainder
 * - shouldDecodeNewlines, shouldDecodeNewlinesForHref
 * - outputSourceRange, warn(message, range): only used outside production
 *
 * @param {string} html - markup to tokenise
 * @param {Object} options - callbacks and flags listed above
 */
export function parseHTML (html, options) {
  const stack = []
  const expectHTML = options.expectHTML
  // Tags that are not written as an open/close pair
  const isUnaryTag = options.isUnaryTag || no
  const canBeLeftOpenTag = options.canBeLeftOpenTag || no
  // Cursor: offset of the remaining `html` within the original input
  let index = 0
  let last, lastTag
  while (html) {
    last = html
    // Make sure we're not in a plaintext content element like script/style
    if (!lastTag || !isPlainTextElement(lastTag)) {
      let textEnd = html.indexOf('<')
      if (textEnd === 0) {
        // Comment:
        if (comment.test(html)) {
          const commentEnd = html.indexOf('-->')

          if (commentEnd >= 0) {
            if (options.shouldKeepComment) {
              options.comment(html.substring(4, commentEnd), index, index + commentEnd + 3)
            }
            advance(commentEnd + 3)
            continue
          }
        }

        // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
        if (conditionalComment.test(html)) {
          const conditionalEnd = html.indexOf(']>')

          if (conditionalEnd >= 0) {
            advance(conditionalEnd + 2)
            continue
          }
        }

        // Doctype:
        const doctypeMatch = html.match(doctype)
        if (doctypeMatch) {
          advance(doctypeMatch[0].length)
          continue
        }

        // End tag:
        const endTagMatch = html.match(endTag)
        if (endTagMatch) {
          const curIndex = index
          advance(endTagMatch[0].length)
          parseEndTag(endTagMatch[1], curIndex, index)
          continue
        }

        // Start tag:
        const startTagMatch = parseStartTag()
        if (startTagMatch) {
          handleStartTag(startTagMatch)
          if (shouldIgnoreFirstNewline(startTagMatch.tagName, html)) {
            advance(1)
          }
          continue
        }
      }

      let text, rest, next
      if (textEnd >= 0) {
        rest = html.slice(textEnd)
        while (
          !endTag.test(rest) &&
          !startTagOpen.test(rest) &&
          !comment.test(rest) &&
          !conditionalComment.test(rest)
        ) {
          // < in plain text, be forgiving and treat it as text
          next = rest.indexOf('<', 1)
          if (next < 0) break
          textEnd += next
          rest = html.slice(textEnd)
        }
        text = html.substring(0, textEnd)
      }

      if (textEnd < 0) {
        text = html
      }

      if (text) {
        advance(text.length)
      }

      if (options.chars && text) {
        options.chars(text, index - text.length, index)
      }
    } else {
      // Inside script/style/textarea: consume everything up to the closing tag.
      let endTagLength = 0
      const stackedTag = lastTag.toLowerCase()
      const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
      const rest = html.replace(reStackedTag, function (all, text, endTag) {
        endTagLength = endTag.length
        if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
          text = text
            .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
        }
        if (shouldIgnoreFirstNewline(stackedTag, text)) {
          text = text.slice(1)
        }
        if (options.chars) {
          options.chars(text)
        }
        return ''
      })
      index += html.length - rest.length
      html = rest
      parseEndTag(stackedTag, index - endTagLength, index)
    }

    // Nothing was consumed in this pass: hand the remainder over as text and stop.
    if (html === last) {
      options.chars && options.chars(html)
      if (process.env.NODE_ENV !== 'production' && !stack.length && options.warn) {
        options.warn(`Mal-formatted tag at end of template: "${html}"`, { start: index + html.length })
      }
      break
    }
  }

  // Clean up any remaining tags
  parseEndTag()

  // Drop the first `n` characters of `html` and move the cursor accordingly.
  function advance (n) {
    index += n
    html = html.substring(n)
  }

  // Try to read a complete start tag at the cursor. Returns
  // { tagName, attrs, start, unarySlash, end } when the closing ">" or "/>" is
  // found, otherwise undefined (input consumed so far stays consumed).
  function parseStartTag () {
    const start = html.match(startTagOpen)
    if (start) {
      const match = {
        tagName: start[1],
        attrs: [],
        start: index
      }
      advance(start[0].length)
      let end, attr
      while (!(end = html.match(startTagClose)) && (attr = html.match(dynamicArgAttribute) || html.match(attribute))) {
        attr.start = index
        advance(attr[0].length)
        attr.end = index
        match.attrs.push(attr)
      }
      if (end) {
        match.unarySlash = end[1]
        advance(end[0].length)
        match.end = index
        return match
      }
    }
  }

  // Turn a parseStartTag() result into decoded { name, value } attributes,
  // push non-unary tags on the stack and invoke options.start.
  function handleStartTag (match) {
    const tagName = match.tagName
    const unarySlash = match.unarySlash

    if (expectHTML) {
      // An open <p> is closed implicitly when isNonPhrasingTag accepts the new tag.
      if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
        parseEndTag(lastTag)
      }
      // A left-open tag is closed implicitly when the same tag opens again.
      if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
        parseEndTag(tagName)
      }
    }

    const unary = isUnaryTag(tagName) || !!unarySlash

    const l = match.attrs.length
    const attrs = new Array(l)
    for (let i = 0; i < l; i++) {
      const args = match.attrs[i]
      // Groups 3/4/5 hold the double-quoted, single-quoted or unquoted value.
      const value = args[3] || args[4] || args[5] || ''
      const shouldDecodeNewlines = tagName === 'a' && args[1] === 'href'
        ? options.shouldDecodeNewlinesForHref
        : options.shouldDecodeNewlines
      attrs[i] = {
        name: args[1],
        value: decodeAttr(value, shouldDecodeNewlines)
      }
      if (process.env.NODE_ENV !== 'production' && options.outputSourceRange) {
        // Skip the whitespace in front of the attribute name. match() returns
        // an array, so the width is the length of its first element; the
        // array's own length would always be 1 for this pattern.
        attrs[i].start = args.start + args[0].match(/^\s*/)[0].length
        attrs[i].end = args.end
      }
    }

    if (!unary) {
      stack.push({ tag: tagName, lowerCasedTag: tagName.toLowerCase(), attrs: attrs, start: match.start, end: match.end })
      lastTag = tagName
    }

    if (options.start) {
      options.start(tagName, attrs, unary, match.start, match.end)
    }
  }

  // Close the nearest open element named `tagName` together with everything
  // opened after it; without a tag name, close every open element. A closing
  // </br> with no opener is reported as a unary start, a closing </p> with no
  // opener as an empty start/end pair.
  function parseEndTag (tagName, start, end) {
    let pos, lowerCasedTagName
    if (start == null) start = index
    if (end == null) end = index

    // Find the closest opened tag of the same type
    if (tagName) {
      lowerCasedTagName = tagName.toLowerCase()
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].lowerCasedTag === lowerCasedTagName) {
          break
        }
      }
    } else {
      // If no tag name is provided, clean shop
      pos = 0
    }

    if (pos >= 0) {
      // Close all the open elements, up the stack
      for (let i = stack.length - 1; i >= pos; i--) {
        if (process.env.NODE_ENV !== 'production' &&
          (i > pos || !tagName) &&
          options.warn
        ) {
          options.warn(
            `tag <${stack[i].tag}> has no matching end tag.`,
            { start: stack[i].start, end: stack[i].end }
          )
        }
        if (options.end) {
          options.end(stack[i].tag, start, end)
        }
      }

      // Remove the open elements from the stack
      stack.length = pos
      lastTag = pos && stack[pos - 1].tag
    } else if (lowerCasedTagName === 'br') {
      if (options.start) {
        options.start(tagName, [], true, start, end)
      }
    } else if (lowerCasedTagName === 'p') {
      if (options.start) {
        options.start(tagName, [], false, start, end)
      }
      if (options.end) {
        options.end(tagName, start, end)
      }
    }
  }
}
| 312 | + |
| 313 | +``` |
| 314 | + |
| 315 | + |
| 316 | + |
| 317 | +**match的内容:** |
| 318 | + |
| 319 | +* tagName: 标签名 |
| 320 | + |
| 321 | +* start:匹配的开始位置 |
| 322 | + |
| 323 | +* end:匹配的结束位置 |
| 324 | + |
| 325 | +* attrs: 标签的属性配置内容 |
| 326 | + |
| 327 | +* unarySlash: 自闭合标签结尾的斜杠(标签以 `/>` 结束时为 `/`,否则为空字符串) |
| 328 | + |
| 329 | + |
| 330 | + |
| 331 | +**attrs的内容:** |
| 332 | + |
| 333 | +* name: 属性的key |
| 334 | +* value:属性的value |
0 commit comments