<?php
/**
 * Scans a document for HTML-like tag fragments (`<...>`) and records where
 * each one starts and ends, its tag name, and its attribute/inner content.
 *
 * Regions registered through setIgnoreRanges() are treated as opaque: a `<`
 * inside one never starts a fragment, and while a fragment is being scanned
 * an ignored region is replaced by an equal number of spaces in the buffer.
 *
 * The scanning state used here ($string, $buffer, $current, $next, $position)
 * and the helpers checkCurrentOffsets(), isStartOfString(),
 * scanToEndOfString() and resetState() are inherited from BaseFragmentParser,
 * which is not visible in this file.
 */
class FragmentsDocumentParser extends BaseFragmentParser
{
    /**
     * Fragments collected by the most recent parse() call, in document order.
     *
     * @var HtmlFragment[]
     */
    private $fragments = [];

    /**
     * Regions to skip, keyed by start offset with the inclusive end offset
     * as the value.
     *
     * @var array<int, int>
     */
    protected $ignoreRanges = [];

    /**
     * True after an opening, non-self-closing `<script>` fragment has been
     * parsed; parse() uses it to jump ahead to the matching `</script`.
     *
     * @var bool
     */
    private $isParsingScript = false;

    /**
     * Sets the regions the parser must treat as opaque.
     *
     * @param array<int, int> $ignoreRanges Start offset => inclusive end offset.
     */
    public function setIgnoreRanges($ignoreRanges)
    {
        $this->ignoreRanges = $ignoreRanges;
    }

    /**
     * Parses the fragment starting at $index and appends it to the fragment
     * list. Nothing is recorded when the tag is not terminated by `>`.
     *
     * @param int $index Offset of the fragment's opening `<`.
     */
    private function parseFragment($index)
    {
        if (! $this->scanFragmentIntoBuffer($index)) {
            $this->buffer = '';

            return;
        }

        $buffer = str($this->buffer);

        $fragment = new HtmlFragment();
        $fragment->startPosition = $index;
        // NOTE(review): $this->position is presumably maintained by
        // checkCurrentOffsets() and points at the closing `>` here - confirm.
        $fragment->endPosition = $this->position;
        $fragment->content = $this->buffer;
        $fragment->isSelfClosing = $buffer->endsWith('/>');
        $fragment->isClosingTag = $buffer->startsWith('</');

        // Strip the delimiters: `<` or `</` at the front, `>` or `/>` at the back.
        $documentContentStartOffset = $fragment->isClosingTag ? 2 : 1;
        $documentContentEndOffset = $fragment->isSelfClosing ? -2 : -1;

        $fragment->documentContent = $buffer->substr(
            $documentContentStartOffset,
            $documentContentEndOffset
        );

        // The tag name is everything before the first space.
        // NOTE(review): only a literal space is treated as the separator, so a
        // name followed directly by a tab or newline is not split off.
        $fragment->tagName = (string) str($fragment->documentContent)
            ->before(' ')
            ->trim();

        // If the tag name itself is an ignored region, the buffer only holds
        // spaces for it; take the real text from the document instead.
        $tagNameStart = $fragment->startPosition + $documentContentStartOffset;

        if (array_key_exists($tagNameStart, $this->ignoreRanges)) {
            $tagNameEnd = $this->ignoreRanges[$tagNameStart];

            $fragment->tagName = str($this->string)
                ->substr($tagNameStart, $tagNameEnd - $tagNameStart + 1)
                ->value();
        }

        if (str($fragment->tagName)->trim()->length() > 0) {
            $fragment->name = $this->buildTagNameFragment(
                $fragment->tagName,
                $tagNameStart
            );
        }

        $innerContent = $this->buildInnerContentFragment(
            $fragment,
            $documentContentStartOffset
        );

        if ($innerContent !== null) {
            $fragment->innerContent = $innerContent;
        }

        if (! $fragment->isClosingTag && ! $fragment->isSelfClosing &&
            mb_strtolower($fragment->tagName, 'UTF-8') === 'script'
        ) {
            $this->isParsingScript = true;
        }

        $this->fragments[] = $fragment;
    }

    /**
     * Reads the tag starting at $index into $this->buffer.
     *
     * Ignored regions are copied as runs of spaces so that offsets inside the
     * buffer keep lining up with offsets in the document.
     *
     * @param int $index Offset of the fragment's opening `<`.
     * @return bool True when the closing `>` was reached; false when another
     *              `<` or the end of the input was hit first.
     */
    private function scanFragmentIntoBuffer($index)
    {
        $this->buffer = '';
        $length = count($this->string);

        for ($i = $index; $i < $length; $i++) {
            if (array_key_exists($i, $this->ignoreRanges)) {
                $jumpIndex = $this->ignoreRanges[$i];
                $this->buffer .= str_repeat(' ', $jumpIndex - $i + 1);
                $i = $jumpIndex;

                continue;
            }

            $this->checkCurrentOffsets($i);

            // A second `<`, or running out of input before `>`, means this
            // was not a complete tag.
            if (($i != $index && $this->current == '<') ||
                ($this->current != '>' && $this->next == null)) {
                return false;
            }

            if ($this->isStartOfString()) {
                $i = $this->scanToEndOfString($i);

                continue;
            }

            $this->buffer .= $this->current;

            if ($this->current == '>') {
                return true;
            }
        }

        // The loop ran off the end of the document (for example after jumping
        // over a trailing ignored region) without ever seeing `>`.
        return false;
    }

    /**
     * Creates the Fragment describing a tag's name.
     *
     * @param string $tagName       The tag name text.
     * @param int    $startPosition Document offset of the name's first character.
     * @return Fragment Its endPosition is the offset of the name's last character.
     */
    private function buildTagNameFragment($tagName, $startPosition)
    {
        $name = new Fragment();
        $name->content = $tagName;
        $name->startPosition = $startPosition;
        $name->endPosition = $startPosition + str($tagName)->length() - 1;

        return $name;
    }

    /**
     * Creates the Fragment describing everything after the tag name
     * (from the first space onward, trimmed).
     *
     * @param HtmlFragment $fragment                   Fragment whose documentContent is set.
     * @param int          $documentContentStartOffset Length of the leading `<` or `</`.
     * @return Fragment|null Null when the tag content contains no space.
     */
    private function buildInnerContentFragment($fragment, $documentContentStartOffset)
    {
        $documentContent = (string) $fragment->documentContent;
        $innerContentStart = mb_strpos($documentContent, ' ');

        if ($innerContentStart === false) {
            return null;
        }

        $innerContent = new Fragment();
        $innerContent->content = str($documentContent)
            ->substr($innerContentStart)
            ->trim();

        // Search from the first space onward so that inner content which also
        // occurs inside the tag name (e.g. `<a a>`) is not located there.
        $relativeStart = mb_strpos(
            $documentContent,
            (string) $innerContent->content,
            $innerContentStart
        );

        $innerContent->startPosition = $relativeStart
            + $documentContentStartOffset
            + $fragment->startPosition;

        // Unlike the name fragment, this end position is start + length,
        // i.e. one past the last character.
        $innerContent->endPosition = $innerContent->startPosition
            + str($innerContent->content)->length();

        return $innerContent;
    }

    /**
     * Finds the character offset of every `<` that lies outside the
     * ignored regions.
     *
     * @return array
     */
    private function buildFragmentIndex()
    {
        $document = (string) $this->string;

        preg_match_all('/</', $document, $matches, PREG_OFFSET_CAPTURE);

        $fragmentStarts = [];
        $previousByteOffset = 0;
        $index = 0;

        foreach ($matches[0] as $match) {
            // PREG_OFFSET_CAPTURE reports byte offsets, while the rest of the
            // parser works with character offsets. Convert incrementally; `<`
            // is a single byte, so every slice ends on a character boundary.
            $byteOffset = $match[1];
            $index += mb_strlen(
                substr($document, $previousByteOffset, $byteOffset - $previousByteOffset),
                'UTF-8'
            );
            $previousByteOffset = $byteOffset;

            if (! $this->isInsideIgnoredRange($index)) {
                $fragmentStarts[] = $index;
            }
        }

        return $fragmentStarts;
    }

    /**
     * Tells whether an offset falls inside any ignored region (bounds inclusive).
     *
     * @param int $index
     * @return bool
     */
    private function isInsideIgnoredRange($index)
    {
        foreach ($this->ignoreRanges as $rangeStart => $rangeEnd) {
            if ($index >= $rangeStart && $index <= $rangeEnd) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the fragments collected by the most recent parse() call.
     *
     * @return HtmlFragment[]
     */
    public function getFragments()
    {
        return $this->fragments;
    }

    /**
     * Parses a document into its tag fragments.
     *
     * Line endings are normalized to "\n" first. After an opening `<script>`
     * fragment, every `<` up to the next `</script` is skipped so that script
     * source is not reported as markup.
     *
     * @param string $value The document text.
     * @return HtmlFragment[] Fragments in document order.
     */
    public function parse($value)
    {
        // Start clean so fragments of a previously parsed document
        // are not mixed into this result.
        $this->fragments = [];
        $this->isParsingScript = false;
        $this->string = new Utf8StringIterator(
            Str::normalizeEol($value, "\n")
        );

        $fragmentStartIndexes = $this->buildFragmentIndex();
        $fragmentCount = count($fragmentStartIndexes);
        $document = (string) $this->string;

        for ($i = 0; $i < $fragmentCount; $i++) {
            $this->parseFragment($fragmentStartIndexes[$i]);
            $this->resetState();

            if (! $this->isParsingScript) {
                continue;
            }

            for ($j = $i + 1; $j < $fragmentCount; $j++) {
                $check = mb_strtolower(
                    mb_substr($document, $fragmentStartIndexes[$j], 8, 'UTF-8'),
                    'UTF-8'
                );

                if ($check === '</script') {
                    // Resume with the closing tag on the next iteration.
                    $i = $j - 1;

                    break;
                }
            }

            // Whether or not a closing tag was found, there is nothing more to
            // skip: without one, the remaining fragments are parsed normally
            // and rescanning for it on every iteration would find nothing.
            $this->isParsingScript = false;
        }

        return $this->fragments;
    }
}