# AUTOGENERATED! DO NOT EDIT! File to edit: src/slideslive.ipynb (unless otherwise specified).

# Public names of this module, i.e. what a star-import of it exposes.
__all__ = ['SL_REGEX', 'SL_INFO', 'SL_CDN', 'YODA_CDN', 'url2id', 'get_sl_info', 'parse_slide_xml',
           'get_slide_metadata', 'get_urls', 'download_slides', 'ffmpeg_concat_script', 'compose_ffmpeg_video',
           'SlidesLive']

# Cell
#export
import json
import os
import re
import requests
import tempfile
import time
import warnings

from xml.etree import ElementTree

# Cell
#export
# Regular expression for SlidesLive URLs: captures the numeric presentation
# ID (`id`) and whatever follows the separating slashes (`name`).
_SL_REGEX_STR = ''.join([
    r'https://slideslive\.(?:com|de)/',
    r'(?P<id>\d+)',
    r'/*',
    r'(?P<name>.*)',
])
SL_REGEX = re.compile(_SL_REGEX_STR)

# URL template of the JSON document describing a presentation
SL_INFO = 'https://ben.slideslive.com/player/{id}'

# URL templates of the content delivery networks
SL_CDN = ('https://cdn.slideslive.com/data/presentations/'
          '{video_id}/slides/{slide_type}/{slide_id}.jpg')
YODA_CDN = ('https://d2ygwrecguqg66.cloudfront.net/data/presentations/'
            '{id}/{data}')
# e.g.: https://d2ygwrecguqg66.cloudfront.net/data/presentations/38956531/slides/big/00793.jpg
#       https://d2ygwrecguqg66.cloudfront.net/data/presentations/38956531/v1/38956531.xml
#       https://d2ygwrecguqg66.cloudfront.net/data/presentations/38956531/v1/slides.json

# Cell
def url2id(sl_url):
    """
    Converts a SlidesLive URL to the presentation ID and name.

    Returns an `(id, name)` pair of strings taken from the `id` and `name`
    groups of `SL_REGEX`; raises an `Exception` if the URL does not match.
    """
    parsed = SL_REGEX.search(sl_url)
    if parsed is not None and parsed.group('id'):
        return parsed.group('id'), parsed.group('name')

    raise Exception('Could not parse the SlidesLive URL.')

# Cell
def get_sl_info(sl_id):
    """
    Pulls information about a SlidesLive presentation.

    `sl_id` is the presentation ID, given either as an integer or as a
    string of decimal digits; anything else raises a `TypeError`.

    Returns the decoded JSON document served at the `SL_INFO` URL
    of the presentation.
    """
    # Accept only the two supported ID forms.  (Previously only
    # non-decimal *strings* were rejected, so e.g. floats or None
    # slipped through and ended up in the request URL.)
    is_int = isinstance(sl_id, int)
    is_decimal_str = isinstance(sl_id, str) and sl_id.isdecimal()
    if not (is_int or is_decimal_str):
        raise TypeError('Incorrect SlidesLive ID format.')

    info_url = SL_INFO.format(id=sl_id)
    info_request = requests.get(info_url)
    info_json = json.loads(info_request.content.decode())

    return info_json

# Cell
#hide
def parse_slide_xml(xml, mode='string'):
    """
    Parse the SlidesLive slide XML metadata.

    `mode` can either be `string` or `file`.
    With `mode='string'`, `xml` holds the XML document itself;
    with `mode='file'`, `xml` is the path of a file holding the document.

    Returns a list with one dictionary per `slide` element, mapping the
    tags of its children (`orderId`, `timeSec`, `time`, `slideName`)
    to their text.

    Raises a `ValueError` for an unknown `mode` and a `RuntimeError`
    if the document has an unexpected root, element or property tag.
    """
    if mode not in ('string', 'file'):
        raise ValueError('The xml parse mode can either be *string* or *file*.')

    slide_properties = ('orderId', 'timeSec', 'time', 'slideName')

    if mode == 'string':
        xml_root = ElementTree.fromstring(xml)
    else:
        # Let the parser open the file itself: it then reads raw bytes and
        # honours the encoding declared in the XML prolog, instead of the
        # content being pre-decoded with the locale's default encoding.
        xml_root = ElementTree.parse(xml).getroot()
    if xml_root.tag != 'videoContent':
        raise RuntimeError(f'Cannot process this XML structure: {xml_root.tag}.')

    slides = []
    for node in xml_root:
        if node.tag != 'slide':
            raise RuntimeError(f'Unexpected slide type: {node.tag}.')

        slide = {}
        for n in node:
            if n.tag not in slide_properties:
                raise RuntimeError(f'Unexpected slide specifier: {n.tag}.')
            slide[n.tag] = n.text
        slides.append(slide)

    return slides

# Cell
def get_slide_metadata(sl_meta_url, approach='json'):
    """
    Fetches and decodes the slide metadata of a SlidesLive presentation.

    `sl_meta_url` is the address of the metadata document and `approach`
    -- one of `json` or `xml` -- states how that document is to be decoded.

    A JSON document is returned as decoded.
    An XML document is converted to `{'slides': [...]}` where each slide
    is described by its `time`, `type` (always `image`) and `image` name;
    the slides are ordered by their `orderId`.
    """
    if approach not in ('xml', 'json'):
        raise ValueError('The approach can either be *json* or *xml*.')

    response = requests.get(sl_meta_url)
    payload = response.content.decode()

    if approach == 'json':
        return json.loads(payload)

    # XML: index the slides by their order ID, then list them in that order
    by_order = {}
    for record in parse_slide_xml(payload):
        order = int(record['orderId'])
        by_order[order] = {'time': int(record['time']),
                           'type': 'image',
                           'image': {'name': record['slideName']}}

    return {'slides': [by_order[order] for order in sorted(by_order)]}

# Cell
def get_urls(video_id, slide_meta, slide_type='big',
             slide=(None, None), time=(None, None)):
    """
    Composes a list of URLs for slides of a given SlidesLive presentation.

    `video_id` specifies the ID of a SlidesLive presentation.
    `slide_meta` is the metadata of a SlidesLive presentation
    as given by the `get_slide_metadata` function.
    `slide_type` specifies the size of the slide.

    A subset of slides may be extracted with this function using either
    the `slide` or `time` parameter (but not both simultaneously).

    The `slide` parameter takes a range of slides to be extracted based
    on the slide ID numbers visible in a SlidesLive presentation.
    For example, `slide=(5, 7)` to extract slides 5--7, **inclusive**;
    `slide=(5, None)` to extract from slide 5 **onwards**; or
    `slide=(None, 6)` to extract up to slide 6 **inclusive**.

    The `time` parameter takes a range of time (visible in a SlidesLive
    presentation) for which slides are to be extracted.
    For example, `time=(5, 10)` to extract slides starting at second 5
    (**inclusive**) and ending before second 10 (**exclusive**);
    `time=(5, None)` to extract from second 5 **onwards**; or
    `time=(None, 50)` to extract up to second 50 **exclusive**.
    Slides that are only partially shown within the time range are
    extracted as well.
    The end of the very last slide is unknown, hence this slide is
    extracted only if it *starts* within the time range.

    Returns a list of URLs (built from the `SL_CDN` template); the list is
    empty when no slide is selected or the presentation has no slides.

    Raises a `TypeError` if `slide` or `time` is not a 2-tuple,
    a `RuntimeError` if both are specified and a `ValueError` if
    `slide_type` is not listed among the `slide_qualities` of `slide_meta`.
    """
    if not isinstance(slide, tuple) or len(slide) != 2:
        raise TypeError('Numeric slide bound (slide) must be a 2-tuple.')
    if not isinstance(time, tuple) or len(time) != 2:
        raise TypeError('Time-based slide bound (time) must be a 2-tuple.')

    slide_given = slide[0] is not None or slide[1] is not None
    time_given = time[0] is not None or time[1] is not None
    if slide_given and time_given:
        raise RuntimeError('Both slide and time bounds cannot be used simultaneously.')

    if 'slide_qualities' in slide_meta:
        if slide_type not in slide_meta['slide_qualities']:
            raise ValueError('The slide type (slide_type) is not recognised.')

    def _slide_url(entry):
        # URL of the image belonging to a single slide metadata entry
        return SL_CDN.format(video_id=video_id,
                             slide_type=slide_type,
                             slide_id=entry['image']['name'])

    entries = slide_meta['slides']

    if slide_given:
        lower_bound = -float('inf') if slide[0] is None else slide[0]
        upper_bound = float('inf') if slide[1] is None else slide[1]
        # slide numbers are 1-based
        return [_slide_url(entry)
                for number, entry in enumerate(entries, start=1)
                if lower_bound <= number <= upper_bound]

    if time_given:
        lower_bound = -float('inf') if time[0] is None else time[0]
        upper_bound = float('inf') if time[1] is None else time[1]

        slides = []
        # every slide but the last one is shown until the next one starts
        for entry, following in zip(entries, entries[1:]):
            t_start = int(entry['time'] / 1000)  # inclusive
            t_end = int(following['time'] / 1000)  # exclusive

            fully_inside = t_start >= lower_bound and t_end <= upper_bound
            # also covers a slide spanning the whole range, i.e., starting
            # before the lower bound and ending after the upper bound
            overlaps = t_start < upper_bound and t_end > lower_bound
            if fully_inside or overlaps:
                slides.append(_slide_url(entry))

        # handle the last slide (if there is any), whose end is unknown
        if entries:
            t_start = int(entries[-1]['time'] / 1000)  # inclusive
            if lower_bound <= t_start < upper_bound:
                slides.append(_slide_url(entries[-1]))
        return slides

    return [_slide_url(entry) for entry in entries]

# Cell
def download_slides(url_list, sleep_time=.2, jobs=16,
                    directory=None, technique='python'):
    """
    Downloads files from a list of URLs (`url_list`).

    The destination directory is either `slides` created
    in the current working directory, or a path specified
    via the `directory` parameter.
    The directory (including any missing parent) is created
    if it does not exist yet.

    Three different download strategies are supported:

    * `technique='python'` -- downloads the images through
      Python's `requests` library one by one, pausing for
      `sleep_time` (`0.2` seconds, by default) after each
      download.
    * `technique='wget'` -- downloads the images by invoking
      `wget` for each image in the list, pausing for
      `sleep_time` (`0.2` seconds, by default) after each
      download.
    * `technique='wget+parallel'` -- downloads multiple images
      simultaneously -- specified by the `jobs` parameter
      (`16`, by default) -- by invoking `wget` through `parallel`.

    With the first two strategies a file that already exists in the
    destination is skipped with a warning; with `technique='python'`
    a warning is also issued, and nothing is saved, when the server
    answers with an error status.

    Raises a `ValueError` for an unknown `technique` and a `RuntimeError`
    if the destination exists but is not a directory.
    """
    import shlex  # needed only here, to quote shell command arguments

    if technique not in ('python', 'wget', 'wget+parallel'):
        raise ValueError('The download `technique` should be one of: '
                         'python, wget, wget+parallel.')

    if directory is None:
        slides_dir = os.path.join(os.getcwd(), 'slides')
    else:
        slides_dir = directory

    if os.path.exists(slides_dir) and not os.path.isdir(slides_dir):
        raise RuntimeError(
            'The slides destination is a file '
            f'and not a directory.\n({slides_dir})')
    # unlike os.mkdir, this also copes with nested destinations
    os.makedirs(slides_dir, exist_ok=True)

    quoted_dir = shlex.quote(str(slides_dir))

    if technique in ('python', 'wget'):
        for url in url_list:
            fn = os.path.basename(url)
            fn_path = os.path.join(slides_dir, fn)

            if os.path.exists(fn_path):
                if os.path.isfile(fn_path):
                    warnings.warn(f'File {fn_path} already exists; skipping download.')
                else:
                    warnings.warn(f'The file path -- {fn_path} -- is a directory; '
                                  'skipping download.')
            else:
                if technique == 'python':
                    # Fetch before opening the file, so that a failed request
                    # does not leave behind an empty file that subsequent
                    # runs would treat as already downloaded.
                    r = requests.get(url)
                    if r.ok:
                        with open(fn_path, 'wb') as f:
                            f.write(r.content)
                    else:
                        # do not store an error page as an image
                        warnings.warn(f'Could not download {url} '
                                      f'(HTTP status {r.status_code}); skipping.')
                else:
                    command = f'wget -P {quoted_dir} {shlex.quote(url)}'
                    with os.popen(command) as stream:
                        print(stream.read())
                time.sleep(sleep_time)
    else:
        # technique == 'wget+parallel': parallel reads the URLs from a file
        with tempfile.NamedTemporaryFile(mode='w') as parallel_file:
            parallel_file.write('\n'.join(url_list))
            # make sure the URLs are on disk before parallel reads them
            parallel_file.flush()

            command = (f'parallel -j {shlex.quote(str(jobs))} '
                       f'wget -P {quoted_dir} '
                       f'< {shlex.quote(parallel_file.name)}')
            with os.popen(command) as stream:
                print(stream.read())

# Cell
def ffmpeg_concat_script(slide_meta, slide_folder=None, last_duration=None,
                         slide=(None, None), time=(None, None)):
    """
    Builds an ffmpeg frame concatenation string from slide metadata.
    Since the duration of the very last slide cannot be inferred,
    it lasts for a user-specified amount of time
    (`last_duration`, `5` by default).

    `slide_folder` specifies the location of the slide images.
    By default, it is the `slides` folder in the current
    working directory.

    A subset of slides may be extracted with this function using either
    the `slide` or `time` parameter (but not both simultaneously).

    The `slide` parameter takes a range of slides to be extracted based
    on the slide ID numbers visible in a SlidesLive presentation.
    For example, `slide=(5, 7)` to extract slides 5--7, **inclusive**;
    `slide=(5, None)` to extract from slide 5 **onwards**; or
    `slide=(None, 6)` to extract up to slide 6 **inclusive**.

    The `time` parameter takes a range of time (visible in a SlidesLive
    presentation) for which slides are to be extracted.
    For example, `time=(5, 10)` to extract slides starting at second 5
    (**inclusive**) and ending before second 10 (**exclusive**);
    `time=(5, None)` to extract from second 5 **onwards**; or
    `time=(None, 50)` to extract up to second 50 **exclusive**.
    Slides that are only partially shown within the time range are
    included with their duration clipped to the range.
    The very last slide is included only if it *starts* within the range.

    Returns a 3-tuple: the ffmpeg concat script, and the start and the end
    (in seconds) of the extracted segment.  If no slide is selected,
    the script is empty and both times are `None`.

    Raises a `TypeError` if `slide` or `time` is not a 2-tuple,
    a `RuntimeError` if both are specified or a slide image is missing,
    and a `ValueError` if the slide folder does not exist.
    """
    if not isinstance(slide, tuple) or len(slide) != 2:
        raise TypeError('Numeric slide bound (slide) must be a 2-tuple.')
    if not isinstance(time, tuple) or len(time) != 2:
        raise TypeError('Time-based slide bound (time) must be a 2-tuple.')

    slide_given = slide[0] is not None or slide[1] is not None
    time_given = time[0] is not None or time[1] is not None
    if slide_given and time_given:
        raise RuntimeError('Both slide and time bounds cannot be used simultaneously.')

    if slide_folder is None:
        slide_folder = os.path.join(os.getcwd(), 'slides')
    if not os.path.isdir(slide_folder):
        raise ValueError(f'Given directory does not exist: {slide_folder}.')

    def _slide_exists(_slide_file):
        # absolute path of a slide image, which is required to exist
        _f = os.path.join(slide_folder, f"{_slide_file}.jpg")
        _f = os.path.abspath(_f)
        if not os.path.isfile(_f):
            raise RuntimeError(f'{_f} file does not exist.')
        return _f

    slides = slide_meta['slides']
    # Index of the final slide (-1 if there are no slides).  It is tracked
    # explicitly rather than read off a loop variable, which is undefined
    # for presentations with fewer than two slides.
    last = len(slides) - 1

    ffmpeg = []
    glob_start, glob_end = None, None
    if slide_given:
        lower_bound = -float('inf') if slide[0] is None else slide[0]
        upper_bound = float('inf') if slide[1] is None else slide[1]
        for i in range(last):
            # slide numbers are 1-based
            if lower_bound <= i + 1 <= upper_bound:
                t_start = slides[i]['time']
                t_end = slides[i + 1]['time']
                t_duration = (t_end - t_start) / 1000
                f = _slide_exists(slides[i]['image']['name'])
                ffmpeg += [f"file '{f}'", f'duration {t_duration:.3f}']

                glob_start = t_start / 1000 if glob_start is None else glob_start
                glob_end = t_end / 1000

        # handle the last slide
        if last >= 0 and lower_bound <= last + 1 <= upper_bound:
            f = _slide_exists(slides[last]['image']['name'])
            last_duration = 5 if last_duration is None else last_duration
            ffmpeg += [f"file '{f}'", f'duration {last_duration:.3f}']

            _glob = slides[last]['time']
            glob_start = _glob / 1000 if glob_start is None else glob_start
            glob_end = (_glob / 1000) + last_duration
    elif time_given:
        lower_bound = -float('inf') if time[0] is None else time[0]
        upper_bound = float('inf') if time[1] is None else time[1]
        for i in range(last):
            t_start = int(slides[i]['time'] / 1000)  # inclusive
            t_end = int(slides[i + 1]['time'] / 1000)  # exclusive

            fully_inside = t_start >= lower_bound and t_end <= upper_bound
            # also covers a slide spanning the whole range, i.e., starting
            # before the lower bound and ending after the upper bound
            overlaps = t_start < upper_bound and t_end > lower_bound
            if not (fully_inside or overlaps):
                continue

            if fully_inside:
                t_start_ = slides[i]['time']
                t_end_ = slides[i + 1]['time']
            else:
                # clip the slide to the requested time range
                t_start_ = (lower_bound * 1000 if t_start < lower_bound
                            else slides[i]['time'])
                t_end_ = (upper_bound * 1000 if t_end > upper_bound
                          else slides[i + 1]['time'])

            f = _slide_exists(slides[i]['image']['name'])
            t_duration = (t_end_ - t_start_) / 1000
            ffmpeg += [f"file '{f}'", f'duration {t_duration:.3f}']

            glob_start = t_start_ / 1000 if glob_start is None else glob_start
            glob_end = t_end_ / 1000

        # handle the last slide, whose end is unknown
        if last >= 0:
            t_start = int(slides[last]['time'] / 1000)  # inclusive
            t_start_ = slides[last]['time'] / 1000
            if lower_bound <= t_start < upper_bound:
                f = _slide_exists(slides[last]['image']['name'])
                if upper_bound == float('inf'):
                    duration = 5 if last_duration is None else last_duration
                elif (last_duration is not None
                          and t_start + last_duration < upper_bound):
                    duration = last_duration
                else:
                    # show the slide until the end of the time range
                    duration = upper_bound - t_start
                ffmpeg += [f"file '{f}'", f'duration {duration:.3f}']

                glob_start = t_start_ if glob_start is None else glob_start
                glob_end = t_start_ + duration
    else:
        for i in range(last):
            t_start = slides[i]['time']
            t_end = slides[i + 1]['time']
            t_duration = (t_end - t_start) / 1000

            f = _slide_exists(slides[i]['image']['name'])
            ffmpeg += [f"file '{f}'", f'duration {t_duration:.3f}']

            glob_start = t_start / 1000 if glob_start is None else glob_start

        # handle the last slide
        if last >= 0:
            f = _slide_exists(slides[last]['image']['name'])
            last_duration = 5 if last_duration is None else last_duration
            ffmpeg += [f"file '{f}'", f'duration {last_duration:.3f}']

            _glob = slides[last]['time'] / 1000
            glob_start = _glob if glob_start is None else glob_start
            glob_end = _glob + last_duration

    # NOTE: the last image must be duplicated without duration due to a bug
    #       in ffmpeg (https://trac.ffmpeg.org/wiki/Slideshow)
    if len(ffmpeg) > 1:
        ffmpeg.append(ffmpeg[-2])

    return '\n'.join(ffmpeg), glob_start, glob_end

# Cell
def compose_ffmpeg_video(ffmpeg_script, video_file=None):
    """
    Builds video slides from an ffmpeg script using the
    `ffmpeg -safe 0 -f concat -i ffmpeg_concat.txt -vsync vfr slides.mp4` command.

    `ffmpeg_script` is the content of the ffmpeg concat file.
    `video_file` is the name of the output video (`slides.mp4` by default);
    the `.mp4` extension is appended if missing.

    Raises a `RuntimeError` if the output file already exists.
    """
    import shlex  # needed only here, to quote shell command arguments

    if video_file is None:
        video_file = 'slides.mp4'
    if not video_file.endswith('.mp4'):
        video_file += '.mp4'
    if os.path.exists(video_file):
        raise RuntimeError(f'{video_file} video file already exists.')

    with tempfile.NamedTemporaryFile(mode='w') as tf:
        tf.write(ffmpeg_script)
        # make sure the script is on disk before ffmpeg reads it
        tf.flush()

        # -pix_fmt yuv420p
        # File names are quoted so that paths with spaces or other shell
        # metacharacters reach ffmpeg as single arguments.
        command = ('ffmpeg -safe 0 -f concat '
                   f'-i {shlex.quote(tf.name)} -vsync vfr {shlex.quote(video_file)}')
        with os.popen(command) as stream:
            print(stream.read())

# Cell
class SlidesLive():
    """
    Convenience wrapper around the SlidesLive helper functions.

    An instance is created from the URL of a SlidesLive presentation
    (`video_url`) and, optionally, the folder in which the slides are
    to be stored (`slides_folder`).

    The slide/time bounds, the slide folder and the video file name passed
    to the methods are remembered by the instance and reused by later calls
    that do not specify them.

    See `url2id`, `get_sl_info` and `get_slide_metadata` for more details.
    """
    def __init__(self, video_url, slides_folder=None):
        """Resolves the presentation ID and fetches its description and slide metadata."""
        self.slides_dir = slides_folder
        self.slides_video = None
        self.slide = None
        self.time = None

        self.start_time = None
        self.end_time = None

        self.video_id, self.video_name = url2id(video_url)
        self.video_description = get_sl_info(self.video_id)

        # JSON metadata is preferred; XML is the fallback
        description = self.video_description
        if 'slides_json_url' in description:
            meta_url, approach = description['slides_json_url'], 'json'
        else:
            meta_url, approach = description['slides_xml_url'], 'xml'
        self.video_metadata = get_slide_metadata(meta_url, approach=approach)

    def get_slide_urls(self, slide_type='big', slide=None, time=None):
        """Returns a list of slide URLs -- see `get_urls` for more details."""
        # an explicit bound replaces the remembered one;
        # a bound that has never been given means "no restriction"
        if slide is not None:
            self.slide = slide
        elif self.slide is None:
            self.slide = (None, None)

        if time is not None:
            self.time = time
        elif self.time is None:
            self.time = (None, None)

        return get_urls(self.video_id, self.video_metadata,
                        slide_type=slide_type,
                        slide=self.slide, time=self.time)

    def download_slides(self, slide_type='big', slide=None, time=None,
                        sleep_time=.2, jobs=16, directory=None, technique='python'):
        """Downloads a collection of slides -- see `get_urls` and `download_slides` for more details."""
        # destination: explicit > remembered > folder named after the video ID
        if directory is not None:
            self.slides_dir = directory
        elif self.slides_dir is None:
            self.slides_dir = self.video_id

        urls = self.get_slide_urls(slide_type=slide_type, slide=slide, time=time)
        download_slides(urls, sleep_time=sleep_time, jobs=jobs,
                        directory=self.slides_dir, technique=technique)

    def get_ffmpeg_script(self, slide_folder=None, last_duration=None,
                          slide=None, time=None):
        """Composes ffmpeg script -- see `ffmpeg_concat_script` for more details."""
        # slide folder: explicit > remembered > folder named after the video ID
        if slide_folder is not None:
            self.slides_dir = slide_folder
        elif self.slides_dir is None:
            self.slides_dir = self.video_id

        # an explicit bound replaces the remembered one;
        # a bound that has never been given means "no restriction"
        if slide is not None:
            self.slide = slide
        elif self.slide is None:
            self.slide = (None, None)

        if time is not None:
            self.time = time
        elif self.time is None:
            self.time = (None, None)

        return ffmpeg_concat_script(self.video_metadata,
                                    slide_folder=self.slides_dir,
                                    last_duration=last_duration,
                                    slide=self.slide,
                                    time=self.time)

    def compose_video(self, video_file=None,
                      slide_folder=None, last_duration=None,
                      slide=None, time=None):
        """Builds slides video -- see `ffmpeg_concat_script` and `compose_ffmpeg_video` for more details."""
        # video name: explicit > remembered > derived from the (remembered)
        # slide folder or, failing that, from the video ID
        if video_file is not None:
            self.slides_video = video_file
        elif self.slides_video is None:
            stem = self.video_id if self.slides_dir is None else self.slides_dir
            self.slides_video = f'{stem}.mp4'

        if slide_folder is not None:
            self.slides_dir = slide_folder
        elif self.slides_dir is None:
            self.slides_dir = self.video_id

        script, start, end = self.get_ffmpeg_script(
            slide_folder=self.slides_dir, last_duration=last_duration,
            slide=slide, time=time)
        ffmpeg_script, self.start_time, self.end_time = script, start, end
        compose_ffmpeg_video(ffmpeg_script, video_file=self.slides_video)

        print(f'\n\nExtracted time segment in seconds:\n    {self.start_time}--{self.end_time}')