diff --git a/tests/test_vtt.py b/tests/test_vtt.py
index 214b71c..2bab9bb 100644
--- a/tests/test_vtt.py
+++ b/tests/test_vtt.py
@@ -262,6 +262,41 @@ def test_parse_captions(self):
             'Caption text #2 line 2'
         )
 
+    def test_parse_caption_with_blank_line_after_timing(self):
+        output = vtt.parse(
+            textwrap.dedent('''
+                WEBVTT
+
+                00:00:00.500 --> 00:00:05.000
+
+                Caption text #1
+            ''').strip().split('\n')
+        )
+
+        self.assertEqual(len(output.captions), 1)
+        self.assertEqual(
+            str(output.captions[0]),
+            '00:00:00.500 00:00:05.000 Caption text #1',
+        )
+
+    def test_parse_caption_with_multiple_blank_lines_after_timing(self):
+        output = vtt.parse(
+            textwrap.dedent('''
+                WEBVTT
+
+                00:00:00.500 --> 00:00:05.000
+
+
+                Caption text #1
+            ''').strip().split('\n')
+        )
+
+        self.assertEqual(len(output.captions), 1)
+        self.assertEqual(
+            str(output.captions[0]),
+            '00:00:00.500 00:00:05.000 Caption text #1',
+        )
+
     def test_parse_styles(self):
         output = vtt.parse(
             textwrap.dedent('''
diff --git a/webvtt/vtt.py b/webvtt/vtt.py
index 6726daf..5e7b9df 100644
--- a/webvtt/vtt.py
+++ b/webvtt/vtt.py
@@ -243,6 +243,52 @@ def format_lines(lines: typing.List[str]) -> typing.List[str]:
         return ['STYLE', *lines]
 
 
+def remove_empty_lines(
+        lines: typing.Sequence[str]
+        ) -> typing.List[str]:
+    """
+    Remove empty lines appearing directly after cue timings.
+
+    Blank lines between a cue timing line and its payload are dropped so
+    the payload stays attached to its cue. When the cue has no payload and
+    another cue follows (starting with its timing line or with an
+    identifier line directly above its timing line), the last blank line
+    is kept so both cues remain separate blocks.
+
+    :param lines: original lines of text
+    :returns: cleaned list of lines
+    """
+    total = len(lines)
+
+    def is_timing(index: int) -> bool:
+        # Out-of-range indexes are simply "not a timing line".
+        return index < total and bool(
+            re.match(WebVTTCueBlock.CUE_TIMINGS_PATTERN, lines[index])
+        )
+
+    cleaned = []
+    i = 0
+    while i < total:
+        cleaned.append(lines[i])
+        if not is_timing(i):
+            i += 1
+            continue
+
+        # Locate the first non-blank line after the timing line.
+        j = i + 1
+        while j < total and not lines[j].strip():
+            j += 1
+
+        # The next cue starts at j either with its timing line or with an
+        # identifier line followed by the timing line. In both cases step
+        # back one line so a single blank separator survives.
+        if j > i + 1 and (is_timing(j) or is_timing(j + 1)):
+            j -= 1
+        i = j  # Continue from after the skipped lines
+
+    return cleaned
+
+
 def parse(
         lines: typing.Sequence[str]
         ) -> ParserOutput:
@@ -255,7 +301,7 @@ def parse(
     if not is_valid_content(lines):
         raise MalformedFileError('Invalid format')
 
-    return parse_items(lines)
+    return parse_items(remove_empty_lines(lines))
 
 
 def is_valid_content(lines: typing.Sequence[str]) -> bool: