"""
Functions for network matched-filter detection of seismic data.
Designed to cross-correlate templates generated by template_gen function
with data and output the detections.
:copyright:
EQcorrscan developers.
:license:
GNU Lesser General Public License, Version 3
(https://www.gnu.org/copyleft/lesser.html)
"""
import copy
import getpass
import os
import re
import shutil
import logging
from typing import List, Set
from functools import lru_cache
import numpy as np
from obspy import Stream
from obspy.core.event import Comment, Event, CreationInfo
from eqcorrscan.core.match_filter.helpers import _test_event_similarity
from eqcorrscan.core.match_filter.matched_filter import MatchFilterError
from eqcorrscan.core import template_gen
Logger = logging.getLogger(__name__)
[docs]
class Template(object):
    """
    Template object holder.

    Contains waveform data and metadata parameters used to generate the
    template.

    :type name: str
    :param name:
        A name associated with the template as a parsable string (no spaces
        allowed)
    :type st: `obspy.core.stream.Stream`
    :param st:
        The Stream object containing the Template waveform data
    :type lowcut: float
    :param lowcut:
        The low-cut filter used to achieve st (float, Hz)
    :type highcut: float
    :param highcut:
        The high-cut filter used to achieve st (float, Hz)
    :type samp_rate: float
    :param samp_rate:
        The Sampling-rate of the template (float, Hz) - note that this should
        be the same as the sampling-rate of all traces in st.
    :type filt_order: int
    :param filt_order:
        The order of the filter applied to achieve st (int), see
        pre-processing functions for a more complete description
    :type process_length: float
    :param process_length:
        The length of data (in seconds) processed to achieve st. e.g. if you
        processed a day of data then cut the template st from this, then
        process_length would be 86400.0 (float)
    :type prepick: float
    :param prepick:
        The time before picks that waveforms were cut to make st
        (seconds, float)
    :type event: `obspy.core.event.Event`
    :param event:
        The Event associated with the template (obspy.core.event.Event)
    """
[docs]
def __init__(self, name=None, st=None, lowcut=None, highcut=None,
samp_rate=None, filt_order=None, process_length=None,
prepick=None, event=None):
name_regex = re.compile(r"^[-a-z_0-9]+$")
if name is not None and not re.match(name_regex, name):
raise ValueError("Invalid name: '%s' - Must satisfy the regex "
"'%s'." % (name, name_regex.pattern))
if name is None:
temp_name = "unnamed"
else:
temp_name = name
self.name = name
self.st = st
self.lowcut = lowcut
self.highcut = highcut
self.samp_rate = samp_rate
if st and samp_rate is not None:
for tr in st:
if not tr.stats.sampling_rate == self.samp_rate:
raise MatchFilterError(
'Sampling rates do not match in data.')
self.filt_order = filt_order
self.process_length = process_length
self.prepick = prepick
if event is not None:
if "eqcorrscan_template_" + temp_name not in \
[c.text for c in event.comments]:
event.comments.append(Comment(
text="eqcorrscan_template_" + temp_name,
creation_info=CreationInfo(agency='eqcorrscan',
author=getpass.getuser())))
self.event = event
@property
def _processing_parameters(self):
"""
Internal function / attribute to return all processing parameters for
quick grouping of templates as tuple.
"""
return (self.lowcut, self.highcut, self.samp_rate, self.filt_order,
self.process_length)
def __repr__(self):
"""
Print the template.
.. rubric:: Example
>>> from obspy import read
>>> print(Template())
Template()
>>> template = Template(
... name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
... filt_order=4, process_length=3600, prepick=0.5)
>>> print(template) # doctest: +NORMALIZE_WHITESPACE
Template a:
3 channels;
lowcut: 2.0 Hz;
highcut: 8.0 Hz;
sampling rate 100 Hz;
filter order: 4;
process length: 3600 s
"""
if self.name is None:
return 'Template()'
if self.st is None:
st_len = 0
else:
st_len = len(self.st)
print_str = ('Template %s: \n\t %s channels;\n\t lowcut: %s Hz;'
'\n\t highcut: %s Hz;\n\t sampling rate %s Hz;'
'\n\t filter order: %s; \n\t process length: %s s'
% (self.name, st_len, self.lowcut, self.highcut,
self.samp_rate, self.filt_order, self.process_length))
return print_str
def __eq__(self, other, verbose=False, shallow_event_check=False):
"""
Check for Template equality, rich comparison operator '=='
.. rubric:: Example
>>> from obspy import read
>>> template_a = Template(
... name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
... filt_order=4, process_length=3600, prepick=0.5)
>>> template_b = template_a.copy()
>>> template_a == template_b
True
This should also cope with nan channels:
>>> import numpy as np
>>> template_c = template_a.copy()
>>> template_c.st[0].data = np.full(
... template_c.st[0].stats.npts, np.nan)
>>> template_c == template_a
False
>>> template_d = template_c.copy()
>>> template_d == template_c
True
This will check all parameters of template including the data in the
stream.
>>> template_b.st[0].data = template_b.st[0].data[0:-1]
>>> template_a == template_b
False
>>> template_b = template_a.copy()
>>> template_b.st[0].stats.station = 'MIS'
>>> template_a == template_b
False
>>> template_b = template_a.copy()
>>> template_b.st = template_b.st[0]
>>> template_a == template_b
False
>>> template_b = template_a.copy()
>>> template_b.lowcut = 5.0
>>> template_a == template_b
False
This will also test if the events in the templates are the same,
ignoring resource ID's as obspy will alter these so that they do not
match.
>>> from eqcorrscan.core.match_filter.party import Party
>>> party = Party().read()
>>> template_a = party.families[0].template
>>> template_b = template_a.copy()
>>> template_b.event.origins[0].time = \
template_a.event.origins[0].time + 20
>>> template_a == template_b
False
"""
for key in self.__dict__.keys():
if key == 'st':
self_is_stream = isinstance(self.st, Stream)
other_is_stream = isinstance(other.st, Stream)
if self_is_stream and other_is_stream:
for tr, oth_tr in zip(self.st.sort(),
other.st.sort()):
if not np.array_equal(
tr.data, oth_tr.data, equal_nan=True):
if verbose:
print("Template data are not equal on "
"{0}".format(tr.id))
return False
for trkey in ['network', 'station', 'channel',
'location', 'starttime', 'endtime',
'sampling_rate', 'delta', 'npts',
'calib']:
if tr.stats[trkey] != oth_tr.stats[trkey]:
if verbose:
print("{0} does not match in "
"template".format(trkey))
return False
elif self_is_stream and not other_is_stream:
return False
elif not self_is_stream and other_is_stream:
return False
elif key == 'event':
self_is_event = isinstance(self.event, Event)
other_is_event = isinstance(other.event, Event)
if self_is_event and other_is_event:
if not _test_event_similarity(
self.event, other.event, verbose=verbose,
shallow=shallow_event_check):
return False
elif self_is_event and not other_is_event:
return False
elif not self_is_event and other_is_event:
return False
elif not self.__dict__[key] == other.__dict__[key]:
return False
return True
def __ne__(self, other):
"""
Rich comparison operator '!='.
.. rubric:: Example
>>> from obspy import read
>>> template_a = Template(
... name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
... filt_order=4, process_length=3600, prepick=0.5)
>>> template_b = template_a.copy()
>>> template_a != template_b
False
"""
return not self.__eq__(other)
[docs]
def copy(self):
"""
Returns a copy of the template.
:return: Copy of template
.. rubric:: Example
>>> from obspy import read
>>> template_a = Template(
... name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
... filt_order=4, process_length=3600, prepick=0.5)
>>> template_b = template_a.copy()
>>> template_a == template_b
True
"""
return copy.deepcopy(self)
[docs]
def same_processing(self, other):
"""
Check is the templates are processed the same.
.. rubric:: Example
>>> from obspy import read
>>> template_a = Template(
... name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
... filt_order=4, process_length=3600, prepick=0.5)
>>> template_b = template_a.copy()
>>> template_a.same_processing(template_b)
True
>>> template_b.lowcut = 5.0
>>> template_a.same_processing(template_b)
False
"""
if self._processing_parameters == other._processing_parameters:
return True
return False
[docs]
def write(self, filename, format='tar'):
    """
    Write template.

    :type filename: str
    :param filename:
        Filename to write to, if it already exists it will be opened and
        appended to, otherwise it will be created.
    :type format: str
    :param format:
        Format to write to, either 'tar' (to retain metadata), or any obspy
        supported waveform format to just extract the waveform.

    .. rubric:: Example

    >>> from obspy import read
    >>> template_a = Template(
    ...     name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
    ...     filt_order=4, process_length=3600, prepick=0.5)
    >>> template_a.write('test_template') # doctest: +NORMALIZE_WHITESPACE
    Template a:
         3 channels;
         lowcut: 2.0 Hz;
         highcut: 8.0 Hz;
         sampling rate 100 Hz;
         filter order: 4;
         process length: 3600 s
    >>> template_a.write('test_waveform.ms',
    ...                  format='MSEED') # doctest: +NORMALIZE_WHITESPACE
    Template a:
         3 channels;
         lowcut: 2.0 Hz;
         highcut: 8.0 Hz;
         sampling rate 100 Hz;
         filter order: 4;
         process length: 3600 s
    """
    from eqcorrscan.core.match_filter.tribe import Tribe

    if format == 'tar':
        # Write through a single-template Tribe to retain all metadata.
        Tribe(templates=[self]).write(filename=filename)
        return self
    # Any other format: dump just the waveform via obspy.
    self.st.write(filename, format=format)
    return self
[docs]
def read(self, filename):
    """
    Read template from tar format with metadata.

    :type filename: str
    :param filename: Filename to read template from.

    :raises IOError: if the file contains zero or multiple templates.

    .. rubric:: Example

    >>> from obspy import read
    >>> template_a = Template(
    ...     name='a', st=read(), lowcut=2.0, highcut=8.0, samp_rate=100,
    ...     filt_order=4, process_length=3600, prepick=0.5)
    >>> template_a.write(
    ...     'test_template_read') # doctest: +NORMALIZE_WHITESPACE
    Template a:
         3 channels;
         lowcut: 2.0 Hz;
         highcut: 8.0 Hz;
         sampling rate 100 Hz;
         filter order: 4;
         process length: 3600 s
    >>> template_b = Template().read('test_template_read.tgz')
    >>> template_a == template_b
    True
    """
    from eqcorrscan.core.match_filter.tribe import Tribe
    tribe = Tribe()
    tribe.read(filename=filename)
    if len(tribe) > 1:
        raise IOError('Multiple templates in file')
    if len(tribe) == 0:
        # Previously fell through to an opaque IndexError below.
        raise IOError('No templates in file')
    # Adopt every attribute of the read template in-place so that the
    # caller's reference to this Template remains valid.
    for key in self.__dict__.keys():
        self.__dict__[key] = tribe[0].__dict__[key]
    return self
[docs]
def detect(self, stream, threshold, threshold_type, trig_int,
           plot=False, plotdir=None, pre_processed=False, daylong=False,
           parallel_process=True, xcorr_func=None, concurrency=None,
           cores=None, ignore_length=False, overlap="calculate",
           full_peaks=False, **kwargs):
    """
    Detect using a single template within a continuous stream.

    :type stream: `obspy.core.stream.Stream`
    :param stream: Continuous data to detect within using the Template.
    :type threshold: float
    :param threshold:
        Threshold level, if using `threshold_type='MAD'` then this will be
        the multiple of the median absolute deviation.
    :type threshold_type: str
    :param threshold_type:
        The type of threshold to be used, can be MAD, absolute or
        av_chan_corr. See Note on thresholding below.
    :type trig_int: float
    :param trig_int:
        Minimum gap between detections in seconds. If multiple detections
        occur within trig_int of one-another, the one with the highest
        cross-correlation sum will be selected.
    :type plot: bool
    :param plot: Turn plotting on or off.
    :type plotdir: str
    :param plotdir:
        The path to save plots to. If `plotdir=None` (default) then the
        figure will be shown on screen.
    :type pre_processed: bool
    :param pre_processed:
        Set to True if `stream` has already undergone processing, in this
        case eqcorrscan will only check that the sampling rate is correct.
        Defaults to False, which will use the
        :mod:`eqcorrscan.utils.pre_processing` routines to resample and
        filter the continuous data.
    :type daylong: bool
    :param daylong:
        Set to True to use the
        :func:`eqcorrscan.utils.pre_processing.dayproc` routine, which
        preforms additional checks and is more efficient for day-long data
        over other methods.
    :type parallel_process: bool
    :param parallel_process:
        Whether to process the continuous data in parallel or not.
    :type xcorr_func: str or callable
    :param xcorr_func:
        A str of a registered xcorr function or a callable for implementing
        a custom xcorr function. For more details see
        :func:`eqcorrscan.utils.correlate.register_array_xcorr`.
    :type concurrency: str
    :param concurrency:
        The type of concurrency to apply to the xcorr function. Options are
        'multithread', 'multiprocess', 'concurrent'. For more details see
        :func:`eqcorrscan.utils.correlate.get_stream_xcorr`.
    :type cores: int
    :param cores: Number of workers for processing and detection.
    :type ignore_length: bool
    :param ignore_length:
        If using daylong=True, then dayproc will try check that the data
        are there for at least 80% of the day, if you don't want this check
        (which will raise an error if too much data are missing) then set
        ignore_length=True. This is not recommended!
    :type overlap: float
    :param overlap:
        Either None, "calculate" or a float of number of seconds to
        overlap detection streams by. This is to counter the effects of
        the delay-and-stack in calculating cross-correlation sums. Setting
        overlap = "calculate" will work out the appropriate overlap based
        on the maximum lags within templates.
    :type full_peaks: bool
    :param full_peaks:
        See :func:`eqcorrscan.utils.findpeaks.find_peaks2_short`

    :returns: Family of detections.

    .. Note::
        When using the "fftw" correlation backend the length of the fft
        can be set. See :mod:`eqcorrscan.utils.correlate` for more info.

    .. Note::
        `stream` must not be pre-processed. If your data contain gaps
        you should *NOT* fill those gaps before using this method.
        The pre-process functions (called within) will fill the gaps
        internally prior to processing, process the data, then re-fill
        the gaps with zeros to ensure correlations are not incorrectly
        calculated within gaps. If your data have gaps you should pass a
        merged stream without the `fill_value` argument
        (e.g.: `stream = stream.merge()`).

    .. note::
        **Data overlap:**

        Internally this routine shifts and trims the data according to
        the offsets in the template (e.g. if trace 2 starts 2 seconds
        after trace 1 in the template then the continuous data will be
        shifted by 2 seconds to align peak correlations prior to summing).
        Because of this, detections at the start and end of continuous data
        streams **may be missed**. The maximum time-period that might be
        missing detections is the maximum offset in the template.

        To work around this, if you are conducting matched-filter
        detections through long-duration continuous data, we suggest using
        some overlap (a few seconds, on the order of the maximum offset
        in the templates) in the continous data. You will then need to
        post-process the detections (which should be done anyway to remove
        duplicates).

    .. note::
        **Thresholding:**

        **MAD** threshold is calculated as the:

        .. math::

            threshold {\\times} (median(abs(cccsum)))

        where :math:`cccsum` is the cross-correlation sum for a
        given template.

        **absolute** threshold is a true absolute threshold based on the
        cccsum value.

        **av_chan_corr** is based on the mean values of single-channel
        cross-correlations assuming all data are present as required
        for the template, e.g:

        .. math::

            av\_chan\_corr\_thresh=threshold \\times (cccsum /
            len(template))

        where :math:`template` is a single template from the input and the
        length is the number of channels within this template.

    .. Note::
        See tutorials for example.
    """
    from eqcorrscan.core.match_filter.tribe import Tribe

    # Accept the deprecated `plotvar` kwarg, but *pop* it so that it is
    # not forwarded through **kwargs to Tribe.detect (previously it was
    # read with .get and leaked downstream).
    plotvar = kwargs.pop("plotvar", None)
    if plotvar is not None:
        Logger.warning("plotvar is deprecated, use plot instead")
        plot = plotvar
    party = Tribe(templates=[self]).detect(
        stream=stream, threshold=threshold,
        threshold_type=threshold_type, trig_int=trig_int, plotdir=plotdir,
        plot=plot, pre_processed=pre_processed, daylong=daylong,
        parallel_process=parallel_process, xcorr_func=xcorr_func,
        concurrency=concurrency, cores=cores, ignore_length=ignore_length,
        overlap=overlap, full_peaks=full_peaks, **kwargs)
    family = party[0]
    # Remove duplicate detections
    family.detections = family._uniq().detections
    return family
[docs]
def construct(self, method, name, lowcut, highcut, samp_rate, filt_order,
              length, prepick, swin="all", process_len=86400,
              all_horiz=False, delayed=True, plot=False, plotdir=None,
              min_snr=None, parallel=False, num_cores=False,
              skip_short_chans=False, **kwargs):
    """
    Construct a template using a given method.

    :param method:
        Method to make the template, the only available method is:
        `from_sac`. For all other methods (`from_client` and
        `from_meta_file`) use `Tribe.construct()`.
    :type method: str
    :type name: str
    :param name: Name for the template
    :type lowcut: float
    :param lowcut:
        Low cut (Hz), if set to None will not apply a lowcut
    :type highcut: float
    :param highcut:
        High cut (Hz), if set to None will not apply a highcut.
    :type samp_rate: float
    :param samp_rate:
        New sampling rate in Hz.
    :type filt_order: int
    :param filt_order:
        Filter level (number of corners).
    :type length: float
    :param length: Length of template waveform in seconds.
    :type prepick: float
    :param prepick: Pre-pick time in seconds
    :type swin: str
    :param swin:
        P, S, P_all, S_all or all, defaults to all: see note in
        :func:`eqcorrscan.core.template_gen.template_gen`
    :type process_len: int
    :param process_len: Length of data in seconds to download and process.
    :type all_horiz: bool
    :param all_horiz:
        To use both horizontal channels even if there is only a pick on
        one of them. Defaults to False.
    :type delayed: bool
    :param delayed: If True, each channel will begin relative to it's own
        pick-time, if set to False, each channel will begin at the same
        time.
    :type plot: bool
    :param plot: Plot templates or not.
    :type plotdir: str
    :param plotdir:
        The path to save plots to. If `plotdir=None` (default) then the
        figure will be shown on screen.
    :type min_snr: float
    :param min_snr:
        Minimum signal-to-noise ratio for a channel to be included in the
        template, where signal-to-noise ratio is calculated as the ratio
        of the maximum amplitude in the template window to the rms
        amplitude in the whole window given.
    :type parallel: bool
    :param parallel: Whether to process data in parallel or not.
    :type num_cores: int
    :param num_cores:
        Number of cores to try and use, if False and parallel=True,
        will use either all your cores, or as many traces as in the data
        (whichever is smaller).
    :type skip_short_chans: bool
    :param skip_short_chans:
        Whether to ignore channels that have insufficient length data or
        not. Useful when the quality of data is not known, e.g. when
        downloading old, possibly triggered data from a datacentre

    .. note::
        `method=from_sac` requires the following kwarg(s):

        :param list sac_files:
            osbpy.core.stream.Stream of sac waveforms, or list of paths to
            sac waveforms.

    .. note::
        See `eqcorrscan.utils.sac_util.sactoevent` for details on
        how pick information is collected.

    .. rubric:: Example

    >>> # Get the path to the test data
    >>> import eqcorrscan
    >>> import os, glob
    >>> TEST_PATH = (
    ...     os.path.dirname(eqcorrscan.__file__) + '/tests/test_data')
    >>> sac_files = glob.glob(TEST_PATH + '/SAC/2014p611252/*')
    >>> template = Template().construct(
    ...     method='from_sac', name='test', lowcut=2.0, highcut=8.0,
    ...     samp_rate=20.0, filt_order=4, prepick=0.1, swin='all',
    ...     length=2.0, sac_files=sac_files)
    >>> print(template) # doctest: +NORMALIZE_WHITESPACE
    Template test:
         12 channels;
         lowcut: 2.0 Hz;
         highcut: 8.0 Hz;
         sampling rate 20.0 Hz;
         filter order: 4;
         process length: 300.0 s

    This will raise an error if the method is unsupported:

    >>> template = Template().construct(
    ...     method='from_meta_file', name='test', lowcut=2.0, highcut=8.0,
    ...     samp_rate=20.0, filt_order=4, prepick=0.1, swin='all',
    ...     length=2.0) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    NotImplementedError: Method is not supported, use \
    Tribe.construct instead.
    """
    if method in ['from_meta_file', 'from_client', 'multi_template_gen']:
        raise NotImplementedError('Method is not supported, '
                                  'use Tribe.construct instead.')
    streams, events, process_lengths = template_gen.template_gen(
        method=method, lowcut=lowcut, highcut=highcut, length=length,
        filt_order=filt_order, samp_rate=samp_rate, prepick=prepick,
        return_event=True, swin=swin, process_len=process_len,
        all_horiz=all_horiz, delayed=delayed, plot=plot, plotdir=plotdir,
        min_snr=min_snr, parallel=parallel, num_cores=num_cores,
        skip_short_chans=skip_short_chans, **kwargs)
    self.name = name
    st = streams[0]
    event = events[0]
    process_length = process_lengths[0]
    # Bug-fix: iterate over a static snapshot of the traces - removing
    # from the Stream while iterating it directly skips the trace that
    # follows each removal.
    for tr in list(st):
        if not np.any(tr.data.astype(np.float16)):
            # Data that are all-zero in float16 indicate missing data.
            Logger.warning('Data are zero in float16, missing data,'
                           ' will not use: {0}'.format(tr.id))
            st.remove(tr)
    self.st = st
    self.lowcut = lowcut
    self.highcut = highcut
    self.filt_order = filt_order
    self.samp_rate = samp_rate
    self.process_length = process_length
    self.prepick = prepick
    self.event = event
    return self
[docs]
def read_template(fname):
    """
    Read a Template object from a tar archive.

    :type fname: str
    :param fname: Filename to read from

    :return: :class:`eqcorrscan.core.match_filter.Template`
    """
    # Delegate to Template.read, which returns the populated instance.
    return Template().read(filename=fname)
def group_templates(templates):
    """
    Group templates into sets of similarly processed templates.

    :type templates: List or Tribe of Templates
    :param templates: Templates to group by processing parameters.

    :return: List of Lists of Templates.
    """
    template_groups = []
    for master in templates:
        # Skip templates that have already been placed in a group.
        if any(master in group for group in template_groups):
            continue
        new_group = [master]
        for slave in templates:
            if master.same_processing(slave) and master != slave:
                new_group.append(slave)
        template_groups.append(new_group)
    # Bug-fix: the original removed empty groups while iterating over the
    # list, which can skip elements; filter with a comprehension instead.
    # (Every group contains at least its master, so this is a safeguard.)
    return [group for group in template_groups if len(group) > 0]
def quick_group_templates(templates):
    """
    Group templates into sets of similarly processed templates.

    :type templates: List or Tribe of Templates
    :param templates: Templates to group.

    :return: List of Lists of Templates.
    """
    # Bucket templates by their processing-parameter tuple, keeping the
    # original template order within each bucket.
    buckets = {}
    for template in templates:
        buckets.setdefault(
            template._processing_parameters, []).append(template)
    # Sort the parameter tuples so that the order in which groups are
    # processed is consistent across different runs.
    return [buckets[parameters] for parameters in sorted(buckets)]
def group_templates_by_seedid(
    templates: List["Template"],
    st_seed_ids: Set[str],
    group_size: int,
) -> List[List["Template"]]:
    """
    Group templates to reduce dissimilar traces

    :param templates:
        Templates to group together
    :param st_seed_ids:
        Seed ids in the stream to be matched with
    :param group_size:
        Maximum group size - will not exceed this size. May be None, in
        which case all matching templates form a single group.
    :return:
        List of lists of templates grouped.
    """
    all_template_sids = {tr.id for template in templates for tr in template.st}
    if len(all_template_sids.intersection(st_seed_ids)) == 0:
        # Bug-fix: the message previously had unbalanced parentheses.
        Logger.warning(f"No matches between stream ({st_seed_ids}) and "
                       f"templates ({all_template_sids})")
    # Get overlapping seed ids so that we only group based on the
    # channels we have in the data. Use a tuple so that it hashes
    # (required by the lru_cache on _group_seed_ids).
    template_seed_ids = tuple(
        (template.name, tuple(
            {tr.id for tr in template.st}.intersection(st_seed_ids)))
        for template in templates)
    # Don't use templates that don't have any overlap with the stream
    template_seed_ids = tuple(
        (t_name, t_chans) for t_name, t_chans in template_seed_ids
        if len(t_chans))
    Logger.info(f"Dropping {len(templates) - len(template_seed_ids)} "
                f"templates due to no matched channels")
    # We will need this dictionary at the end for getting the templates by id
    # NOTE(review): assumes template names are unique - duplicates would
    # silently collapse here.
    template_dict = {t.name: t for t in templates}
    # group_size can be None, in which case we don't actually need to group
    if (group_size is not None) and (group_size < len(template_seed_ids)):
        # Pass off to cached function
        out_groups = _group_seed_ids(
            template_seed_ids=template_seed_ids, group_size=group_size)
        # Convert from groups of template names to groups of templates
        out_groups = [[template_dict[t] for t in out_group]
                      for out_group in out_groups]
    else:
        Logger.info(f"Group size ({group_size}) larger than n templates"
                    f" ({len(template_seed_ids)}), no grouping performed")
        out_groups = [[template_dict[t[0]] for t in template_seed_ids]]
    assert sum(len(grp) for grp in out_groups) == len(template_seed_ids), (
        "Something went wrong internally with grouping - we don't have the "
        "right number of templates. Please report this bug")
    return out_groups
@lru_cache(maxsize=2)
def _group_seed_ids(template_seed_ids, group_size):
    """
    Cachable version of internals to avoid re-computing for every day.

    Greedily partitions template names into groups of at most
    ``group_size`` based on their matched seed-ids, preferring groups
    whose members share the same seed-ids.

    :param template_seed_ids:
        Hashable tuple of (template-name, tuple-of-seed-ids) pairs.
        Must be non-empty (the first element is accessed unconditionally).
    :param group_size: Maximum number of templates in any output group.
    :return: List of lists of template names.
    """
    # Convert hashable tuple to dict for ease
    template_seed_ids = {tup[0]: set(tup[1]) for tup in template_seed_ids}
    # Get initial groups with matched traces - sort by length
    sorted_templates = sorted(
        template_seed_ids, key=lambda key: len(template_seed_ids[key]),
        reverse=True)
    # Group into matching sets of seed-ids - ideally all groups would have the
    # same seed ids and no nan-traces
    groups, group, group_sids, group_sid = (
        [], [sorted_templates[0]], [], template_seed_ids[sorted_templates[0]])
    for i in range(1, len(sorted_templates)):
        # Check that we don't exceed the maximum group size first; start a
        # new group whenever the seed-id set changes or the group is full.
        if len(group) >= group_size or (
                template_seed_ids[sorted_templates[i]] != group_sid):
            groups.append(group)
            group = [sorted_templates[i]]
            group_sids.append(group_sid)
            group_sid = template_seed_ids[sorted_templates[i]]
        else:
            group.append(sorted_templates[i])
    # Get the final group
    groups.append(group)
    group_sids.append(group_sid)
    Logger.info(f"{len(groups)} initial template groups")
    # Check if all the groups are full - if (nearly) all are, stop here.
    groups_full = sum([len(grp) == group_size for grp in groups])
    if groups_full >= (len(template_seed_ids) // group_size) - 1:
        return groups
    # Smush together groups until group-size condition is met.
    # Make list of group ids - compare ungrouped sids to intersection of sids
    # in group - add most similar, then repeat. Start with fewest sids.
    n_original_groups = len(groups)
    grouped = np.zeros(n_original_groups, dtype=bool)
    # Use -1 as no group given
    group_ids = np.ones(n_original_groups, dtype=int) * -1
    # Sort groups by length of seed-ids (ascending)
    order = np.argsort([len(sid) for sid in group_sids])
    groups = [groups[i] for i in order]
    group_sids = [group_sids[i] for i in order]
    # Assign shortest group to the zeroth group
    grouped[0], group_id = 1, 0
    group_ids[0] = group_id
    group_sid = group_sids[0]
    group_len = len(groups[0])
    # Loop until all groups have been assigned a final group
    Logger.info("Running similarity grouping")
    while grouped.sum() < n_original_groups:
        # Work out difference between groups: smaller symmetric difference
        # of seed-ids means more similar channel sets.
        diffs = [(i, len(group_sid.symmetric_difference(other_sid)))
                 for i, other_sid in enumerate(group_sids) if not grouped[i]]
        diffs.sort(key=lambda tup: tup[1])
        closest_group_id = diffs[0][0]
        if group_len + len(groups[closest_group_id]) > group_size:
            # Max size reached, make new group
            group_id += 1
            # Take the next shortest ungrouped group as the new seed
            i = 0
            while grouped[i]:
                i += 1
            group_sid = group_sids[i]
            grouped[i] = 1
            group_ids[i] = group_id
            group_len = len(groups[i])
        else:
            # Add in closest
            grouped[closest_group_id] = 1
            group_ids[closest_group_id] = group_id
            # Update the group seed ids to include the new ones
            group_sid = group_sid.union(group_sids[closest_group_id])
            group_len += len(groups[closest_group_id])
    Logger.info("Completed grouping")
    # Flatten the merged groups back into lists of template names.
    out_groups = []
    for group_id in set(group_ids):
        out_group = [t for i in range(n_original_groups)
                     if group_ids[i] == group_id for t in groups[i]]
        out_groups.append(out_group)
    return out_groups
if __name__ == "__main__":
    import doctest

    doctest.testmod()
    # List files to be removed after doctest. 'test_template_read.tgz' is
    # written by the Template.read doctest and was previously left behind.
    cleanup = ['test_template.tgz', 'test_waveform.ms',
               'test_template_read.tgz']
    for f in cleanup:
        if os.path.isfile(f):
            os.remove(f)
        elif os.path.isdir(f):
            shutil.rmtree(f)