Source code for origli.utilities.utilities

#!/usr/bin/env python

"""

 Script name: utilities.py

 Description:
    File containing utilities




"""
try:
    import cv2
except:
    pass
import json
import numpy as np 
import pandas as pd
import os
import matplotlib
from sklearn import mixture
from sklearn.decomposition import PCA
import h5py
import time
import random as rd
import xml.etree.ElementTree as ET # for XML file
import sys
from math import floor
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import concurrent.futures
import itertools as itrtls
from math import factorial
from scipy.stats import ttest_ind, binom, binom_test, chi2, beta
from sklearn.utils import shuffle as sklearn_shuffle
import scipy.integrate as integrate




try:
   from gwpy.timeseries import TimeSeries, TimeSeriesDict
   from gwpy.segments import DataQualityFlag
   from gwpy.table import EventTable
   from gwpy.table.io.pycbc import filter_empty_files as filter_pycbc_live_files
   from gwpy.time import tconvert
except ImportError:
   print('There is not gwpy module yet')
try:
    from gwtrigfind import find_trigger_files
except ModuleNotFoundError:
    raise ValueError("gwtrigfind has not been installed yet. please do pip install gwtrigfind")

matplotlib.use("Agg")
# matplotlib.rcParams["backend"] = "TkAgg"
# Test display
# try:
#     matplotlib.rcParams["backend"] = "TkAgg"
#     m_dis = os.environ["DISPLAY"]
# except KeyError:
#     print("Display problems. Set to Agg")
#     matplotlib.use('Agg')
#
import matplotlib.pyplot as plt
plt.rc('text', usetex=False) # this is used for fixing a gwpy compatible issue
from matplotlib.colors import LogNorm
from matplotlib.ticker import MaxNLocator

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import origli.utilities.const as Const # for multi frequency bands defined in


__author__ = "Kentaro Mogushi"
__copyright__ = "Copyright 2019-2025, Kentaro Mogushi"
__credits__ = ["Line for credits"]
__license__ = "GPL"
__version__ = "0.1.0"
__maintainer__ = "K. Mogushi"
__email__ = "kmhhz@umsystem.edu"
__status__ = "Developing"


# def convert_imageset_to_pandas(m_indexfile):
#     """
#
#     :param m_indexfile:
#     :return:
#     """
#
#     df_in = pd.read_csv(m_indexfile)
#     label_list=[]
#     image_list=None
#
#     for ii in range(len(df_in)):
#         m_img_array,h,w,d = image_to_1d(df_in.iloc[ii]["id"])
#
#         m_img_array = np.insert(m_img_array,0,d,axis=1)
#         m_img_array = np.insert(m_img_array,0,w,axis=1)
#         m_img_array = np.insert(m_img_array,0,h,axis=1)
#
#         try:
#             image_list = np.concatenate((image_list,m_img_array),axis=0)
#         except ValueError:
#             image_list = m_img_array
#         label_list.append(df_in.loc[ii]["label"])
#
#
#     label_list = np.asarray(label_list)
#
#     #build the labeling
#     col_names =["h","w","d"]
#     for hi in range(h):
#         for wi in range(w):
#             for di in range(d):
#                 col_names.append("px-h%s-w%s-d%s" % (hi,wi,di))
#
#
#     image_df = pd.DataFrame(data=image_list,columns=col_names)
#     label_df = pd.DataFrame(data=label_list,columns=["label"])
#
#     out_df = label_df.join(image_df)
#
#     if len(out_df)!=len(df_in):
#         raise RuntimeError("Input and output numer of images does not match!")
#
#     #if m_output_name is not None:
#     #    out_df.to_csv(m_output_name)#,columns=["label",col_names])
#     #    out_df.to_hdf(m_output_name,key="df", mode='w', complevel=9, complib="bzip2")
#     return out_df
#
# def image_to_1d(m_input_image_filename):
#     """
#
#     :param m_input_image:
#     :return:
#     """
#
#     m_input_image = cv2.imread(m_input_image_filename)
#
#     h = m_input_image.shape[0]
#     w = m_input_image.shape[1]
#     d = m_input_image.shape[2]
#     cv2.destroyAllWindows()
#
#     return m_input_image.reshape(1,h*w*d),h,w,d
#
# def read_json_file(m_filename):
#     m_out_dict = json.loads(open(m_filename).read())
#
#     return m_out_dict
#
#
#


###################################################
# grab the GPS times of a glitch for a certain ifo
###################################################

def GrabGPStimesSafechannel(fileid, ifo, SNRthre, glitch, PathSafeChannel, Epochstart, Epochend, Commissioning_lt=None):
    '''
    description:
        1. load the glitch metadata table (CSV) given by fileid
        2. keep the rows of the requested ifo and glitch label with snr >= SNRthre
           whose GPS time lies strictly between Epochstart and Epochend
        3. drop the rows whose GPS time lies inside any commissioning interval
           (both bounds of an interval are treated as part of the interval)
        4. load the list of safe channels and replace the first '_' of every
           channel name with '-'
    USAGE: GPSs, ids, re_sfchs = GrabGPStimesSafechannel(fileid, 'L1', 7, 'Blip', PathSafeChannel, Epochstart, Epochend)

    :param fileid: path to a CSV file with (at least) the columns 'ifo', 'label', 'snr', 'GPStime' and 'id'
    :param ifo: value matched against the 'ifo' column, e.g. 'L1'
    :param SNRthre: the minimum threshold of SNR (inclusive), e.g., 7
    :param glitch: value matched against the 'label' column, e.g. 'Blip'
    :param PathSafeChannel: path to a text file with one safe channel name per row
    :param Epochstart: GPS time when a science run begins, float or int (exclusive bound)
    :param Epochend: GPS time when a science run ends, float or int (exclusive bound)
    :param Commissioning_lt: commissioning intervals as a sequence of [start, end] pairs,
                             e.g., [[Cstart1, Cend1], [Cstart2, Cend2]] (a list or a numpy array), None in default
    :return
            GPSs: numpy array of the selected GPS times
            ids: numpy array of the corresponding values of the 'id' column
            re_sfchs: numpy array of the renamed safe channels (file order is kept)
    :raises IndexError: if a safe channel name contains no '_'
    '''
    # import the data file in pandas format
    df = pd.read_csv(fileid)
    # triggers of the requested glitch class seen in ifo with SNR of at least SNRthre
    df_int = df[(df['ifo'] == ifo) & (df['label'] == glitch) & (df['snr'] >= SNRthre)]
    # restrict the triggers to the interior of the run
    df_int = df_int[(df_int['GPStime'] > Epochstart) & (df_int['GPStime'] < Epochend)]
    # remove the commissioning times; an identity test is used because comparing
    # a numpy array against None is element-wise and cannot be used as a condition
    if Commissioning_lt is not None:
        for Commissioning in Commissioning_lt:
            Commissioning_start, Commissioning_end = Commissioning[0], Commissioning[1]
            outside = (df_int['GPStime'] < Commissioning_start) | (df_int['GPStime'] > Commissioning_end)
            df_int = df_int[outside]
    # the list of GPS times and of unique IDs
    GPSs = np.array(df_int['GPStime'].tolist())
    ids = np.array(df_int['id'].tolist())
    # take the list of the safe channels (one name per row, no header)
    sfchs = np.array(pd.read_csv(PathSafeChannel, names=['channel'])['channel'])
    # rename the channels: split at the first '_' from the left and re-join with '-'
    re_sfchs = []
    for channel_name in sfchs:
        splited_name = channel_name.split('_', 1)
        re_sfchs.append('%s-%s' % (splited_name[0], splited_name[1]))
    re_sfchs = np.array(re_sfchs)
    # return the GPS times, the ids and the renamed safe channels as numpy arrays
    return GPSs, ids, re_sfchs


###################################################
# grab the list of the safe channels in IFO
###################################################

def GrabSafechannel(PathSafeChannel):
    '''
    description:
        read the list of the safe channels, sort it and rename every channel by
        replacing the first '_' of its name with '-'
    USAGE: re_sfchs = GrabSafechannel('/path/to/O2_omicron_channel_list_hvetosafe_GDS.txt')

    :param PathSafeChannel: path to a text file with one safe channel name per row
    :return
        re_sfchs: numpy array of the renamed safe channels, in sorted order of the original names
    '''
    # one channel name per row, no header line
    channel_table = pd.read_csv(PathSafeChannel, names=['channel'])
    # sorted copy of the names as a numpy array
    ordered_names = np.sort(np.array(channel_table['channel']))
    # split every name once at its first '_' (from the left)
    name_parts = (name.split('_', 1) for name in ordered_names)
    # re-join the two pieces with '-'
    re_sfchs = np.array(['{}-{}'.format(parts[0], parts[1]) for parts in name_parts])
    # return the renamed safe channels in numpy array format
    return re_sfchs






###################################################
# remove the channels that are not used in O2
####################################################

def RemoveChannelUnused(re_sfchs, PathListChannelUnused):
    '''
    Description:
        Remove from a list of safe channels
            1. 'GDS-CALIB_STRAIN' (it is not a safe channel),
            2. every channel listed in the file PathListChannelUnused,
            3. every channel whose name, after the first '-', has 'NOISEMON'
               as its third '_'-separated token.
        The order of the remaining channels is kept.
    USAGE:
        re_sfchs = RemoveChannelUnused(re_sfchs, PathListChannelUnused)

    :param re_sfchs: the list of the safe channels in the numpy array format
    :param PathListChannelUnused: path to a text file readable by numpy.loadtxt
                                  listing the unused channels (may be empty or hold a single name)
    :return re_sfchs: the list of the safe channels without the removed channels in the numpy array format
    '''
    re_sfchs = np.asarray(re_sfchs)
    # loadtxt gives a 0-d array for a file holding a single name; promote it to
    # 1-d so that a one-line file is handled like any other
    UnusedChannels = np.atleast_1d(np.loadtxt(PathListChannelUnused, dtype='str'))
    # every name that has to be dropped by exact match
    excluded = {'GDS-CALIB_STRAIN'}
    excluded.update(UnusedChannels.ravel().tolist())

    def _is_noisemon(channel_name):
        # e.g. 'SUS-ETMY_L1_NOISEMON_UL' -> ['ETMY', 'L1', 'NOISEMON', 'UL']
        pieces = channel_name.split('-')
        if len(pieces) < 2:
            return False
        tokens = pieces[1].split('_')
        return len(tokens) > 2 and tokens[2] == 'NOISEMON'

    # a single pass builds the mask of the channels that are kept
    keep = np.array([(name not in excluded) and (not _is_noisemon(name)) for name in re_sfchs],
                    dtype=bool)
    return re_sfchs[keep]


def ListUsedSafeChannel(path_list_channel, ifo):
    '''
    description:
        1. read a headerless .csv file whose four columns are, in this order, the H1 safe
           channels, the H1 unused channels, the L1 safe channels and the L1 unused channels
        2. remove the unused channels of the requested observatory from its safe channels

    USAGE: sfchs = ListUsedSafeChannel(path_list_channel, ifo)

    :param path_list_channel: a path to a list of channels (.csv file)
    :param ifo: observatory {L1, H1} in str
    :return: sfchs: subset of safe channels in numpy array (file order is kept)
    :raises KeyError: if ifo is neither 'H1' nor 'L1'
    '''
    # the names are assigned positionally; both observatories follow the same
    # '<ifo>ListSafeChannel' / '<ifo>ListUnusedChannel' pattern so that the
    # look-ups below work for 'L1' as well as for 'H1'
    column_names = ['H1ListSafeChannel', 'H1ListUnusedChannel',
                    'L1ListSafeChannel', 'L1ListUnusedChannel']
    ListChannel_df = pd.read_csv(path_list_channel, names=column_names)
    # columns shorter than the longest one are padded with NaN by pandas;
    # the padding is not a channel name, so it is dropped
    safe_column = ListChannel_df['{}ListSafeChannel'.format(ifo)].dropna()
    unused_column = ListChannel_df['{}ListUnusedChannel'.format(ifo)].dropna()
    sfchs = np.array(safe_column.tolist())
    UnusedChannels = set(unused_column.tolist())
    # keep the safe channels that are not listed as unused
    keep = np.array([name not in UnusedChannels for name in sfchs], dtype=bool)
    return sfchs[keep]



#
# #######################################
# # take the subset of the channels
# #######################################
#
# def TakeSubsetList(list, subset_num, cor_list=[None]):
#     '''
#
#     :param list: an original list
#     :param subset_num: the size of the output list which is randomly chosen a subset of the original list
#     :param cor_list: a corresponding list if it is related to the original list (e.g., IDs of the list GPSs given by GravitySpy)
#     :return: a reduced list, a corresponding list
#     '''
#     print(cor_list[0])
#     if cor_list[0] == None:
#         if len(list) > subset_num: # if the size of an original list is greater than the size of the subset
#             list_subset = np.array(rd.sample(list, subset_num))
#         else: # if not
#             list_subset = list
#         return list_subset
#     else:
#         if len(list) > subset_num: # if the size of an original list is greater than the size of the subset
#             list_subset = np.array(rd.sample(list, subset_num))
#         else: # if not
#             list_subset = list
#         # the index having the elements of the subset in list
#         ind_subset = [np.where(list==i)[0][0] for i in list_subset]
#         # the subset of cor_list
#         cor_list_subset = cor_list[ind_subset]
#         return list_subset, cor_list_subset

#
# ################################################################
# # write out the output of the safe channels around GPS times
# ###############################################################
#
# def SaveTimeseriesHdf5(GPSs, ids, re_sfchs, ifo, wd, outputpath, outputfilename):
#     '''
#         This extracts the output of the safe channels around the GPS times
#         input:
#             GPSs: the list of GPS times for a given glitch
#             ids: the list of unique IDs labeled GravitySpy
#             re_sfchs: the list the safe channels
#             ifo: kind of interferomter {'H1', 'L1', 'V1'}
#             wd: forward and backward time window around the peak GPS time
#             outputpath: Path of an output directory
#             outputfilename: the name of a output file where the file stores the time series of a glitch
#         output:
#             hdf5 file
#             data structure: group: "gps0" has dataset "data0", "data1", ...
#                             dataset: "data0" has times series of every safe channels
#                             dataset attributes: "GPS", "channel", "SamplingRate"
#         description:
#             1- if there is a group about a GPS exists
#                 1- if there are full channel data sets stored in this group
#                     1- if there actually time series stored in each data set
#                         -> skip this group and go to the next GPS iteration
#                     2- if there is at least an empty data set in this group
#                         -> delete this group and re-create this group, then extract data sets from the server
#                 2- if there are no or not enough data sets stored in this group
#                     -> delete this group and re-create this group, then extract data sets from the server
#
#         USAGE:write_output_SafeChannel('test.')
#         '''
#     # the path and the file name
#     ouputPathFileName = os.path.join(outputpath, outputfilename)
#     # make a log file
#     logFileName = '{}LOG.txt'.format(outputfilename.split('.')[0]) # log file name
#     logfilePathName =  os.path.join(outputpath, logFileName) # # path and the file name of the log file
#     logfileobj = open(logfilePathName, 'w') # Truncate file to zero length or create text file for writing
#
#     # looping with GPS times and the list of id
#     for group_id, (GPS, id)in enumerate(zip(GPSs, ids), 0):
#         try:  # create a file in hdf5 format it there is no such a file
#             f = h5py.File(ouputPathFileName, 'a')
#         except IOError:
#             f = h5py.File(ouputPathFileName, 'r+')
#         # note, group_id is used as a label for a group in hdf5 file
#         # make a group in the file as, for example, gps00001
#         try: # create a group if it does not exist
#             g = f.create_group("gps{:05d}".format(group_id))
#             # add attributes
#             g.attrs['GPS'] = GPS
#             g.attrs['ifo'] = ifo
#             g.attrs['id'] = id
#         except ValueError: # this group already exists
#             # read this group
#             g = f["gps{:05d}".format(group_id)]
#             # check there are every safe channels data stored in this group
#             # check if there are all the channels in this group (e.g., 815)
#             if len(g.keys()) == len(re_sfchs): # yes, there are data sets with the same number of the channels
#                 # check if there are actually time series stored in this dataset
#                 for dset_name in g.keys():
#                     if g['{}'.format(dset_name)].value.shape[0] == 0: # no, some dataset is empty
#                         argu = 'there is at least an empty dataset' # for the next evaluation
#                         break # get out this loop
#                     else: # yes, all the data sets are filled with data
#                         argu = 'there are full datasets' # for the next evaluation
#                         pass
#                 if argu == 'there is at least one empty dataset':
#                     del f['gps{:05d}'.format(group_id)]  # delete this group
#                 elif argu == 'there are full datasets':
#                     print('group{:05d} out of {} groups: there are full datasets around GPS time {}'.format(group_id, len(GPSs),GPS))
#                     log1 = 'group{:05d} out of {} groups at GPS time {} was already done. ID:{}'.format(group_id, len(GPSs),GPS, id)
#                     logfileobj.write(log1) # take a log
#                     logfileobj.write('\n') # to the next line
#                     continue # go to the next GPS iteration
#             else: # no, there are less or no data sets with the same number of the channels
#                 del f['gps{:05d}'.format(group_id)]  # delete this group
#             try:  # re-create this group
#                 g = f.create_group("gps{:05d}".format(group_id))
#                 g.attrs['GPS'] = GPS
#                 g.attrs['ifo'] = ifo
#                 g.attrs['id'] = id
#             except ValueError:
#                 pass
#
#         # the list of channel names
#         chnames = ['{0}:{1}'.format(ifo, i) for i in re_sfchs]
#         # start time
#         tstart = GPS - wd
#         # end time
#         tend = GPS + wd
#         try: # extract time series from the server
#             tbeg = time.time()
#             print('group{:05d} out of {} groups : taking data set around GPS time {} ...'.format(group_id, len(GPSs), GPS))
#             FrameType = '{}_R'.format(ifo)
#             data = TimeSeriesDict.get(chnames, tstart, tend, frametype=FrameType, verbose=True)
#             tfin = time.time()
#             print('It took {0} sec for getting data at {1}'.format(float(tfin-tbeg), GPS))
#             # make log
#             log2 = 'group{:05d} out of {} groups at GPS time {} is now done. ID:{}'.format(group_id, len(GPSs),GPS, id)
#             logfileobj.write(log2) # take a log
#             logfileobj.write('\n')  # go to the new line
#         except IndexError:
#             # this gives, e.g., IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ISI-GND_STS_ETMY_X_DQ offset: 1153856716)
#             log3 = 'IndexError: group{:05d} out of {} groups st GPS time {} was SKIPPED. ID:{}'.format(group_id, len(GPSs), GPS, id)
#             print(log3)
#             logfileobj.write(log3) # take a log
#             logfileobj.write('\n')  # go to the new line
#             continue # go to the next GPS iteration
#         # store the data
#         for dset_id, channel in enumerate(chnames, 0):
#             # time series of the data
#             ts = data[channel].value
#             # sampling rate in Hz
#             samplingrate = data[channel].sample_rate.value
#             # store the time series with compression
#             dset = g.create_dataset("ch{:05d}".format(dset_id), data=ts, compression="gzip", compression_opts=9)
#             # make attributes on the data set 'dset'
#             dset.attrs['channel'] = channel.split(':')[1]
#             dset.attrs['SamplingRate'] = samplingrate
#         f.close()
#     logfileobj.close()
#     return None
#
#



def Multiprocess_ConvertToTable(cache_indiv, trigger_pipeline, IFO, Columns):
    '''
    description:
        Read one trigger file with EventTable.read and convert it to a pandas frame.
        Multi process does not work if this is included inside the class IdentifyGlitch()
        so that it is defined here globally.

    :param cache_indiv: an individual cache (each trigger file)
    :param trigger_pipeline: a name of trigger pipeline {omicron, pycbc-live}
    :param IFO: a name of the detector {L1, H1}; only used for pycbc-live
    :param Columns: a list of columns for a metadata; only used for pycbc-live
    :return: df_indiv: a metadata of triggers in pandas frame; an empty frame when reading
                       raises KeyError (both pipelines) or ValueError (omicron only)
    :raises ValueError: if trigger_pipeline is neither 'omicron' nor 'pycbc-live'
    '''
    if trigger_pipeline == 'omicron':
        try:
            t = EventTable.read(cache_indiv, format='ligolw', tablename='sngl_burst')
            df_indiv = t.to_pandas()  # convert it to a pandas table
        except (KeyError, ValueError):
            # a file that can not be read as a sngl_burst table contributes no trigger
            df_indiv = pd.DataFrame()
    elif trigger_pipeline == 'pycbc-live':
        try:
            t = EventTable.read(cache_indiv, format='hdf5.pycbc_live', ifo='{}'.format(IFO), columns=Columns)
            df_indiv = t.to_pandas()  # convert it to pandas frame
        except KeyError:
            df_indiv = pd.DataFrame()
    else:
        # without this branch the return statement would hit an unbound local name
        raise ValueError('trigger_pipeline has to be either "omicron" or "pycbc-live", got {!r}'.format(trigger_pipeline))
    return df_indiv


[docs]class IdentifyGlitch():

    def __init__(self):
        '''
        description:
            nothing is initialised here; no instance attribute is set
        '''
        pass

[docs]    def FindObservingTimeSegments(self, IFO, startT, endT, outputMother_dir, state='observing'):
        '''
        description:
            1. take a DataQualityFrag from a server
            2. save its segment data as a HDF file
            3. take active segments
            4. return active segments as a numpy array
        USAGE: SegmentsMat, trigger_dir = FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)

        :param IFO: a type of interferometer, 'L1' or 'H1'
        :param startT: (float, int or string: e.g., 'Dec 8 2016') starting time
        :param endT: ending time
        :param outputMother_dir: an ouput directory where the HDF5 file will be stored
        :param state: state of an interferometer, {observing, nominal-lock}
        :return: SegmentsMat: active segments in a numpy array

        '''
        # choose a state channel
        if state == 'observing':
            StateChannel = 'DMT-ANALYSIS_READY:1'
        elif state == 'nominal-lock':
            StateChannel = 'DMT-GRD_ISC_LOCK_NOMINAL:1'
        else:
            print('Please a state of this interferometer either "observing" or "nominal-lock"')
            sys.exit()
        # take segments data from a server
        segs = DataQualityFlag.query('{0}:{1}'.format(IFO, StateChannel), startT, endT)
        # path of an output directory
        trigger_dir = os.path.join(outputMother_dir, '{}_{}'.format(int(startT), int(endT)))
        try:
            os.mkdir(trigger_dir)
        except OSError:
            pass
        # path of an output file
        output_path = os.path.join(trigger_dir, 'DataQualityFlag.h5')
        # save it as a HDF5 file
        try:
            segs.write(output_path)
        except IOError: # if this file already exists
            # delete the file
            os.remove(output_path)
            # re-create the file
            segs.write(output_path)
        # import this HDF5 file
        segs_h5 = h5py.File(output_path, 'r')
        # get active segments
        Active_segs = segs_h5['{0}:{1}'.format(IFO, StateChannel)]['active'][()]
        SegmentsMat = np.zeros((Active_segs.size, 2))
        for ind, seg in enumerate(Active_segs, 0):
            SegmentsMat[ind, 0] = seg[0]  # starting time
            SegmentsMat[ind, 1] = seg[2]  # ending time
        SegmentsMat = SegmentsMat.astype(int)
        return SegmentsMat, trigger_dir


[docs]    def OmimcronTriggerPath(self, ListSegments, TriggerDir, IFO, output_file='omicron.txt', channel='GDS-CALIB_STRAIN'):
        '''
        description:
            1. log-in either Livingston or Hanford cluster
            2. take the path of files storing omicron triggers during a segment
            3. save those paths into an output file
         USAGE: OmimcronTriggerPath('L1', ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger', IFO)

        :param IFO: interferometer (L1, or H1)
        :param ListSegments: (list of lists) [[s1, e1], [s2, e2], ...], this is because I want to exclude segments of non-observing times
        :param channel: (str) the name of a channel
        :param trigger_dir: (str) an output directory
        :param output_file: (str) an output file
        :param: IFO:
        :return: trigger_dir

        '''
        output_path = os.path.join(TriggerDir, output_file)
        for ObservingSegment in ListSegments:
            startT, endT = ObservingSegment[0], ObservingSegment[1]
            # save paths of all the omicron trigger files between startT and eneT to output_path
            os.system('/home/detchar/opt/gwpysoft-2.7/bin/trigfind {0}:{1} omicron {2} {3} >> {4}'.format(IFO, channel, startT, endT, output_path))
        print('Done for getting a list of paths of trigger files')
        return None


[docs]    def CopyOmicroTriggerandUnzip(self, input_file, input_dir, TriggerDir, output_dir='OmicronTriggerXML'):
        '''
        description:
            1. load a file comprising all the paths of omicron trigger files you are interested in
            2. copy all the omicron trigger file in your working place
            3. go to an output directory
            4. unzip all the files and replace them with the zipped files
            5. go back to a working directory
        USAGE: CopyOmicroTriggerandUnzip('omicron.txt', trigger_dir, trigger_dir)

        :param input_file: an input file
        :param input_dir:  an input directory, in default, a current directory
        :param trigger_dir: # a directory right above an output directory
        :param output_dir: # an output directory where all the omicron trigger files will be stored
        :return: None

        '''
        #++++++++++++++ load a file and copy ++++++++++++++++++++++++++
        input_path = os.path.join(input_dir, input_file) # input path
        f = np.loadtxt(input_path, dtype=np.str) # load an input file
        output_path = os.path.join(TriggerDir, output_dir) # output path
        try:
            os.mkdir(output_path) # mkdir an output directory
        except OSError:
            pass # do nothing if the directory already exists
        for i in f: # iterate through paths of omicron triggers stored in an input file
            copy = 'cp {0} {1}'.format(i, output_path) # make the command line used in the next line
            os.system(copy) #  copy omicron trigger files into your working place
        #+++++++++ unzip them +++++++++++++++++++++++++
        os.chdir(output_path) # go to an output directory
        os.system('for f in *.gz ; do gunzip -f "$f" > ./"${f%.*}" ; done') # unzip all the file
        # -f replaces the original .xml.gz file with .xml
        # -c unzip .xml.gz file and also keep the .xml file
        os.chdir(TriggerDir) # go back to a working directory
        return None

[docs]    def readXML(self, input_dir, input_file):
        '''
        description:
            1. take meta data stored s a .xml file in a directory named OmicronTriggerXML
            2. meta data matrix in the form of list
        USAGE: TriggerMat = readXML(input_dir, input_file)

        :param input_dir: an input directory
        :param input_file:  input file name
        :return: name_a: (numpy array)
        '''
        # read a XML file
        path_input = os.path.join(input_dir, input_file) # input path
        tree = ET.parse(path_input) # tree of an input file
        root = tree.getroot() # root of an input file
        #++++++++ get the omicron trigger's data set++++++++++++++++
        g = []
        for i in root[2]:
            a = str(i.text)
            g.append(a)
        # trim meta data for the sake of convenience
        h = np.array(g)  # convert list object to numpy array
        ind_nan = np.where(h == 'None')  # find the indices where element is 'None'
        h_new = np.delete(h, ind_nan)  # without nan
        a = h_new[0].split('\n\t\t\t')  # split
        a[-1] = a[-1].replace('\n\t\t', '')  # remove the last as the last one is redundant
        a_new = a[1:] # get rid of the first one which is redundant and make another array
        # +++++++++ get the name of the data set (columns)++++++++++++++++
        name_list = []
        # name column names
        for i in root[2]:
            names = str(i.attrib)
            name_list.append(names)
        for idx in range(len(name_list)):
            name_list[idx] = name_list[idx].split(':')[-1].replace("'}", "")
        # remove the last one
        #name_list = name_list[:-1]
        #name_com = name_list[0] + ''
        # combine every name
        name_com = ''
        for i in name_list:
            name_com = name_com + i + ','
        # remove the ',' at the end and make it as list
        name_com = [name_com[:-1]]
        # combine the name and numbers
        name_a = name_com + a_new
        return name_a

[docs]    def ExtractOmcronTriggerMetadata(self, name_a):
        '''
        description:
            1. load meta data matrix, expected to be created by readXML()
            2. re-arrange for the sake of convenience and convert it in a numpy array
            3. return numpy re-arranged meta data matrix
        USAGE: Matdataset = ExtractOmcronTriggerMetadata(name_a)

        :param name_a: omicron trigger info (list)
        :return: Matdataset: omicron trigger metadata (numpy array)
        '''
        # create the list of peak GPS times and durations
        Matdataset = np.array([])
        a_new = name_a[1:]
        for i in a_new:
            peakGPS = float(i.split(',')[1]) # peak GPS time
            peakGPSns = float('0.'+i.split(',')[2]) # peak GPS time ns
            peakGPSfull = peakGPS+peakGPSns # full peak GPS time
            startGPS = float(i.split(',')[3]) # start GPS time
            startGPSns = float('0.'+i.split(',')[4]) # start GPS time ns
            startGPSfull = startGPS+startGPSns # full start GPS time
            peakFreq = float(i.split(',')[9])
            snr = float(i.split(',')[14])
            Amplitude = float(i.split(',')[13])
            centralFreq = float(i.split(',')[10])
            duration = float(i.split(',')[5]) # duration
            bandwidth = float(i.split(',')[11])
            chisq = float(i.split(',')[16])
            chisqDof = float(i.split(',')[17])
            confidence = float(i.split(',')[15])
            id = None
            ifo = i.split(',')[0].split('"')[1]
            DataList = np.array([peakGPSfull, startGPSfull, peakFreq, snr, Amplitude, centralFreq, duration, bandwidth, chisq, chisqDof, confidence, id, ifo])
            Matdataset = np.vstack([Matdataset, DataList]) if Matdataset.size else DataList
        return Matdataset


[docs]    def SaveMetaDataAsCSV(self, MetaData, output_dir, output_file):
        '''
        description:
            1. load trigger meta data matrix, which is expected to be created by ExtractOmcronTriggerMetadata()
            2. add label for these triggers with 'unknown'
            3. add imgUrl with None
            4. save this matrix as .csv file
        USAGE: SaveMetaDataAsCSV(AllMatDataStr, trigger_dir, 'OmicrontriggerMetadata.csv')

        :param MetaData: numpy array, omicron trigger metadata
        :param output_file: an output file
        :param output_dir:  and output directory
        :return: None
        '''
        Names = ['GPStime',
                 'startGPStime',
                 'peakFreq',
                 'snr',
                 'amplitude',
                 'centralFreq',
                 'duration',
                 'bandwidth',
                 'chisq',
                 'chisqDof',
                 'confidence',
                 'id',
                 'ifo']
        df = pd.DataFrame(MetaData, columns=Names)
        df = df.assign(label=['arbitrary'] * len(df))
        df = df.assign(imgUrl=[None] * len(df))
        output_path = os.path.join(output_dir, output_file)
        df.to_csv(output_path)
        return None



[docs]    def CombinedIndentifyingProcess(self, IFO, ListSegments, TriggerDir):
        '''
        description:
            1. use OmimcronTriggerPath() to log in either L1 or H1 cluster and get a list of trigger files paths
            2. use CopyOmicroTriggerandUnzip() to copy trigger files and unzip them
            3. iterate with trigger XML files
                3-1. use readXML() to get mata data stored in a .xml file
                3-2. use ExtractOmcronTriggerMetadata() to re-arrange meta data matrix
            4. save the matrix as .csv file
            5. copy the .csv file into the CIT cluster and go back to the CIT cluster
        USAGE:  CombinedIndentifyingProcess(self, 'L1', ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger')

        :param IFO: ifo {H1, L1, V1}
        :param ListSegments: a list of segments
        :param TriggerDir: a mother directory of a trigger file
        :return: None
        '''
        print('Begin to get trigger meta data')
        #  get a list of trigger files paths
        print('get trigger files paths')
        self.OmimcronTriggerPath(ListSegments, TriggerDir, IFO)
        # copy trigger files and unzip them
        print('copy trigger files and unzip them')
        self.CopyOmicroTriggerandUnzip('omicron.txt', TriggerDir, TriggerDir)
        # path of trigger XML files
        omicrontriggerXMLdir = os.path.join(TriggerDir, 'OmicronTriggerXML')
        # a list of trigger files
        ListTriggerFiles = os.listdir(omicrontriggerXMLdir)
        AllMatData = np.array([])
        print('creating meta data matrix')
        for TrigerFile in ListTriggerFiles: # iterate with trigger files
            TriggerMat = self.readXML(omicrontriggerXMLdir, TrigerFile) # get trigger meta data
            Matdataset = self.ExtractOmcronTriggerMetadata(TriggerMat) # re-arrange this meta data
            AllMatData = np.vstack([AllMatData, Matdataset]) if AllMatData.size else Matdataset # combine matrices
        self.SaveMetaDataAsCSV(AllMatData, TriggerDir, 'OmicrontriggerMetadata.csv') # save the matrix as .csv file
        print('Done!')
        return None


[docs]    def pyCBC_SNR_filter(self, all_pycbc_triggers_frame, SNR_low_cut, SNR_high_cut):
        '''
        description:
            band pass filter with SNR for pycbc triggers
        USAGE: df_SNR_cut = pyCBC_SNR_filter(all_pycbc_triggers_frame, SNR_low_cut=7.5, SNR_high_cut=150)

        :param all_pycbc_triggers_frame: a pandas frame of all the pyCBC triggers
        :param SNR_low_cut: a lower cutoff SNR
        :param SNR_high_cut: a higher cutoff SNR
        :return: df_new: pycbc triggers that pass the all the condition defined above
        '''
        df_new = all_pycbc_triggers_frame[(all_pycbc_triggers_frame['snr'] >= SNR_low_cut) & (all_pycbc_triggers_frame['snr'] < SNR_high_cut)]
        return df_new

[docs]    def pyCBC_template_duration_filter(self, pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration):
        '''
        description:
            band pass filter with template duration for pycbc triggers
        USAGE: df_template_duration_cut = pyCBC_template_duration_filter(pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration)

        :param pycbc_triggers_frame: pyCBC triggers in pandas frame
        :param low_cutoff_duration: a lower cutoff of the template duration in sec
        :param high_cutoff_duration: a higher cutoff of the template duration in sec
        :return: df_new: triggers with template druation between low_cutoff_duration and high_cutoff_duration
        '''
        df_new = pycbc_triggers_frame[(pycbc_triggers_frame['template_duration'] >= low_cutoff_duration) & (pycbc_triggers_frame['template_duration'] < high_cutoff_duration)]
        return df_new

[docs]    def pyCBC_totalmass_filter(self, pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass):
        '''
        description
            band pass filter with total mass for pycbc triggers
        USAGE: df_totalmass_cut = pyCBC_totalmass_filter(pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass)

        :param pycbc_triggers_frame: pyCBC triggers in pandas frame
        :param low_cutoff_totalmass: a lower cutoff of the total mass in solar mass
        :param high_cutoff_totalmass: a higher cutoff of the total mass in solar mass
        :return: triggers with total mass between low_cutoff_totalmass and high_cutoff_totalmass
        '''
        pycbc_triggers_frame = pycbc_triggers_frame.assign(totalmass=pycbc_triggers_frame.mass1+pycbc_triggers_frame.mass2)
        df_new = pycbc_triggers_frame[(pycbc_triggers_frame['totalmass'] >= low_cutoff_totalmass) & (pycbc_triggers_frame['totalmass'] < high_cutoff_totalmass)]
        return df_new

[docs]    def pyCBC_massratio_filter(self, pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio):
        '''
        description
            band pass filter with total mass for pycbc triggers
        USAGE: df_massratio_cut = pyCBC_query_massratio(pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio)

        :param pycbc_triggers_frame: pyCBC triggers in pandas frame
        :param low_cutoff_totalmass: a lower cutoff of the total mass in solar mass
        :param high_cutoff_totalmass: a higher cutoff of the total mass in solar mass
        :return: df_new: triggers with mass ratio between low_cutoff_massratio and high_cutoff_massratio
        '''
        pycbc_triggers_frame = pycbc_triggers_frame.assign(massratio=pycbc_triggers_frame.mass1/pycbc_triggers_frame.mass2)
        df_new = pycbc_triggers_frame[(pycbc_triggers_frame['massratio'] >= low_cutoff_massratio) & (pycbc_triggers_frame['massratio'] < high_cutoff_massratio)]
        return df_new


[docs]    def calculate_chisqr_weighted_snr(self, snr, chisq, chisq_dof):
        '''
        description:
            calculate the chi-square weighted SNR
            Reference: Macleod et al. 2015, Equation (21)
        USAGE: chisqr_weighted_snr = calculte_chisqr_weighted_snr(self, snr, chisqr, chisqr_dof)

        :param snr: SNR
        :param chisqr: chi-square
        :param chisqr_dof: chi-square degrees of freedom
        :return: chisqr_weighted_snr: the chi-square weighted SNR
        '''
        if chisq <= chisq_dof:
            chisq_weighted_snr = snr
        elif chisq > chisq_dof:
            chisq_weighted_snr = snr * ( (1 + ( chisq / chisq_dof ) ** 3 )/2. )**(-1/6.)
        return chisq_weighted_snr



[docs]    def pyCBC_chisqr_weighted_snr_filter(self, pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff):
        '''
        description
            high pass filter with chi square weighted SNR for pycbc triggers
        USAGE: df_chi_square_weighted_cut = pyCBC_chisqr_weighted_snr_filter(pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff)

        :param pycbc_triggers_frame:  pyCBC triggers in pandas frame
        :param chisqr_weighted_snr_lower_cutoff: a lower cutoff chi-square weighted snr
        :return: df_new: triggers with  chi square weighted SNR above chisqr_weighted_snr_lower_cutoff
        '''
        pycbc_triggers_frame = pycbc_triggers_frame.assign(chisqr_weighted_snr= np.vectorize(self.calculate_chisqr_weighted_snr)(pycbc_triggers_frame.snr, pycbc_triggers_frame.chisq, pycbc_triggers_frame.chisq_dof.astype('float')))
        df_new = pycbc_triggers_frame[pycbc_triggers_frame.chisqr_weighted_snr > chisqr_weighted_snr_lower_cutoff]
        return df_new


[docs]    def pyCBC_query_outlier(self, pycbc_triggers_frame, BinNum, Nsigma, cut):
        '''
        description:
            1. bin the triggers with  values of log10 of chi-square per DOF
            2. calculate the lower  bound of log10 of chi-square per DOF in the bin
            3 (cut = 'median')  Calculate the median of log10 of SNR and log10 of chi square per degree of freedom in the bin
            3.(cut = 'mad') Calculate  the upper bound  of log10 of SNR in the bin where the lower bound is the median minus the median absolute deviation, the upper  bound is the median plus the median absolute deviation
            4. Polynomial fit the upper bound of log10 of SNR as a function of the lower bound of log10 of chi-square per DOF
            5. split triggers using the polynomial fit to loud and quiet triggers
        USAGE: pycbc_loud = pyCBC_query_outlier(pycbc_triggers_frame, BinNum=50, Nsigma=1, cut='median')

        :param pycbc_triggers_frame: pycbc triggers in pandas frame
        :param BinNum: the number of bins of a histogram for log10 of chi squar per degrees of freedom
        :param Nsigma: an integer to determine the upper bound of the quiet triggers
        :param cut: a method of cut {median or mad}
        :return: pycbc_loud: loud triggers
        '''

        # log10 of SNR
        pycbc_triggers_frame = pycbc_triggers_frame.assign(log_SNR= np.log10(pycbc_triggers_frame.snr))
        # log10 of  chi-square per its degree of freedom
        pycbc_triggers_frame = pycbc_triggers_frame.assign(log_chisqr_per_dof=np.log10(pycbc_triggers_frame.chisq/pycbc_triggers_frame.chisq_dof))
        Bins_log_chisqr_per_dof = np.linspace(pycbc_triggers_frame.log_chisqr_per_dof.min(), pycbc_triggers_frame.log_chisqr_per_dof.max(), BinNum, endpoint=True)
        if cut =='median':
            # ------------------------------------------------------------------------------------+
            # important outlier is lower part of off-diagonal line of chi_square per DQF VS SNR  |
            # ------------------------------------------------------------------------------------+
            # a list of medians of log10 of SNR
            list_median_log_SNR = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_SNR.median() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            # a list of medians of log10 of chi-square per its degree of freedom
            list_median_log_chisqr_per_dof = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_chisqr_per_dof.median() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            #-----------------------------------+
            # clean up the representative curve |
            #-----------------------------------+
            # find the index of elements =  nan for the median values ----------
            list_index_nan_median_log_SNR = np.where(np.isnan(list_median_log_SNR) == True)[0]  # for median of log SNR
            list_index_nan_median_log_chisqr_per_dof = np.where(np.isnan(list_median_log_chisqr_per_dof) == True)[0]  # for median of log of chisqr per DOF
            # concatenate the lists of indices and keep non-dupulicated ones
            list_drop_index = np.unique(np.concatenate((list_index_nan_median_log_SNR, list_index_nan_median_log_chisqr_per_dof)))
            # delete the elements = nan
            list_median_log_SNR_cleaned = np.delete(list_median_log_SNR, list_drop_index)
            list_median_log_chisqr_per_dof_cleaned = np.delete(list_median_log_chisqr_per_dof, list_drop_index)
            # ----------+
            # best fit  |
            # ----------+
            # the polynomial curve is y = p[0] * x**3 + p[1] * x**2 + p[2] * x  + p[3]
            # where x is log 10 of median values of chi square per degrees of freedom
            # y is log10 of SNR
            p = np.polyfit(list_median_log_chisqr_per_dof_cleaned, list_median_log_SNR_cleaned, 3)
            #------------+
            # split      |
            #------------+
            # loud pycbc triggers are triggers with SNR above the upper cut defined above
            pycbc_loud = pycbc_triggers_frame[pycbc_triggers_frame.log_SNR > p[0] * pycbc_triggers_frame.log_chisqr_per_dof ** 3 + p[1] * pycbc_triggers_frame.log_chisqr_per_dof ** 2 + p[2] * pycbc_triggers_frame.log_chisqr_per_dof ** 1 + p[3]]
            # loud pycbc triggers are triggers with SNR below the upper cut defined above
            pycbc_quite = pycbc_triggers_frame[pycbc_triggers_frame.log_SNR <= p[0] * pycbc_triggers_frame.log_chisqr_per_dof ** 3 + p[1] * pycbc_triggers_frame.log_chisqr_per_dof ** 2 + p[2] * pycbc_triggers_frame.log_chisqr_per_dof ** 1 + p[3]]
        elif cut =='mad':
            # ------------------------------------------------------------------------------------+
            # important outlier is lower part of off-diagonal line of chi_square per DQF VS SNR  |
            # ------------------------------------------------------------------------------------+
            # a list of medians of log10 of SNR
            list_median_log_SNR = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_SNR.median() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            # a list of medians of log10 of chi-square per its degree of freedom
            list_median_log_chisqr_per_dof = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_chisqr_per_dof.median() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            # a list of median absolute deviations of log10 of SNR
            list_mad_log_SNR = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_SNR.mad() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            # a list of median absolute deviations of log10 of chi-square per its degree of freedom
            list_mad_log_chisqr_per_dof = np.array([pycbc_triggers_frame[(pycbc_triggers_frame.log_chisqr_per_dof >= start_bin) & (pycbc_triggers_frame.log_chisqr_per_dof < end_bin)].log_chisqr_per_dof.mad() for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])])
            #-----------------------------------+
            # clean up the representative curve |
            #-----------------------------------+
            # find the index of elements =  nan or zero for the upper bound
            list_index_nan_mad_log_SNR = np.where(np.isnan(list_mad_log_SNR) == True)[0] # nan for median of log SNR
            list_index_zero_mad_log_SNR = np.where(list_mad_log_SNR == 0)[0] # zero for median of log SNR
            list_index_nan_mad_log_chisqr_per_dof = np.where(np.isnan(list_mad_log_chisqr_per_dof) == True)[0] # nan for median of log of chisqr per DOF
            list_index_zero_mad_log_chisqr_per_dof = np.where(list_mad_log_chisqr_per_dof == 0)[0] # nan for median of log of chisqr per DOF
            # concatenate the lists of indices and keep non-dupulicated ones
            list_drop_index_bd = np.unique(np.concatenate((list_index_nan_mad_log_SNR, list_index_zero_mad_log_SNR, list_index_nan_mad_log_chisqr_per_dof, list_index_zero_mad_log_chisqr_per_dof)))
            # delete the elements = nan or zero
            list_upper_bd_log_SNR = np.delete(list_median_log_SNR, list_drop_index_bd) + Nsigma * np.delete(list_mad_log_SNR, list_drop_index_bd)
            list_lower_bd_log_chisqr_per_dof = np.delete(list_median_log_chisqr_per_dof, list_drop_index_bd) - Nsigma * np.delete(list_mad_log_chisqr_per_dof, list_drop_index_bd)
            #----------+
            # best fit |
            #----------+
            # the polynomial curve is y = p[0] * x**3 + p[1] * x**2 + p[2] * x  + p[3]
            # where x is log 10 of median + Nsigma * median absolute deviation of chi square per degrees of freedom
            # y is log10 of SNR
            p = np.polyfit(list_lower_bd_log_chisqr_per_dof, list_upper_bd_log_SNR, 3)
            #-------+
            # split |
            #-------+
            # loud pycbc triggers are triggers with SNR above the upper cut defined above
            pycbc_loud = pycbc_triggers_frame[pycbc_triggers_frame.log_SNR > p[0] * pycbc_triggers_frame.log_chisqr_per_dof ** 3 + p[1] * pycbc_triggers_frame.log_chisqr_per_dof ** 2 + p[2] * pycbc_triggers_frame.log_chisqr_per_dof ** 1 + p[3]]
            # loud pycbc triggers are triggers with SNR below the upper cut defined above
            pycbc_quite = pycbc_triggers_frame[pycbc_triggers_frame.log_SNR <= p[0] * pycbc_triggers_frame.log_chisqr_per_dof ** 3 + p[1] * pycbc_triggers_frame.log_chisqr_per_dof ** 2 + p[2] * pycbc_triggers_frame.log_chisqr_per_dof ** 1 + p[3]]
        # #------------------
        # # plot
        # #------------------
        # import matplotlib
        # matplotlib.use('Agg')
        # import matplotlib.pyplot as plt
        # y = np.linspace(pycbc_triggers_frame.log_chisqr_per_dof.min(), pycbc_triggers_frame.log_chisqr_per_dof.max(),100)
        # x_poly = p[0] * y**3 + p[1] * y**2 + p[2] * y**1 + p[3]
        # plt.scatter(10**pycbc_loud.log_SNR, 10**pycbc_loud.log_chisqr_per_dof, s=3, color='blue', label='loud trigger')
        # plt.scatter(10 ** pycbc_quite.log_SNR, 10 ** pycbc_quite.log_chisqr_per_dof, s=3, color='cornflowerblue',label='quite trigger')
        # if cut == 'median':
        #     plt.scatter(10**list_median_log_SNR_cleaned, 10**list_median_log_chisqr_per_dof_cleaned, marker='x', color='r', label='median')
        #     plt.plot(10 ** x_poly, 10 ** y, ls='--', color='black', label='poly fit')
        # elif cut == 'mad':
        #     plt.scatter(10 ** list_upper_bd_log_SNR, 10 ** list_lower_bd_log_chisqr_per_dof, marker='x', color='magenta',label='ub')
        #     plt.plot(10 ** x_poly, 10 ** y, ls='-', color='black', label='poly fit ub')
        # plt.yscale('log')
        # plt.xscale('log')
        # plt.xlabel('SNR')
        # plt.ylabel('Chi-square per DOF')
        # plt.legend()
        # plt.savefig('pycbc.png')
        # plt.close()
        return pycbc_loud



[docs]    def pycbc_clustering_window_around_trigger(self, pycbc_trigger, IFO, one_sided_window, extension_duration):
        '''
        description:
            This is a clustering filter
            1. take a frame of pycbc triggers
            2. pick up a trigger with highest SNR in a window around a trigger. The window size is twice of "one_sided_window"
            3. create new columns required to run this code
        USAGE: pycbc_trigger = pycbc_clustering_window_around_trigger(pycbc_trigger, IFO, one_sided_window=0.1, extension_duration=1.5)

        :param pycbc_trigger: pycbc triggers in pandas frame format
        :param IFO: ifo
        :param one_sided_window: one sided window in seconds around a trigger for clustering,
        :param extension_duration: factor for extending the template duration, e.g., extension_duration = 1.5 makes the duration of the on-source window 1.5 times longer than that of the trigger
        :return: pycbc_trigger: clustered pycbc triggers
        '''
        pycbc_trigger = pycbc_trigger.sort_values(by=['end_time'])
        pycbc_trigger = pycbc_trigger.reset_index(drop=True)
        for index in pycbc_trigger.index:  # iterate sub-samples
            try:  # most of pycbc triggers are overlapped for a same burst, I remove redundant triggers
                #one_sided_window = 0.1  # one sided time window
                start_window = float(pycbc_trigger.loc[index]['end_time'] - one_sided_window)  # starting time of the window for a trigger
                end_window = float(pycbc_trigger.loc[index]['end_time'] + one_sided_window)  # ending time of the window for a trigger
                df_overlapped = pycbc_trigger[(start_window <= pycbc_trigger['end_time']) & (pycbc_trigger['end_time'] <= end_window)]  # triggers overlapping this trigger
                max_snr = df_overlapped['snr'].max()  # a trigger with a maximum trigger during this window
                event_ind = df_overlapped[df_overlapped['snr'] == max_snr]['end_time'].index[0]  # index for the trigger with the maximum SNR
                all_index = np.array(df_overlapped.index.tolist())  # a list of index of the overlapping triggers
                redundant_indecies = all_index[all_index != event_ind]  # a list of index of the triggers that are redundant
                pycbc_trigger = pycbc_trigger.drop(redundant_indecies, axis=0)  # drop the redundant triggers
                # saved_index = np.append(saved_index, event_ind) # save a trigger with highest SNR
            except KeyError:  # if this trigger is already removed, go to the next iteration
                continue
        #########################
        # get the meta data
        pycbc_trigger = pycbc_trigger.assign(GPStime=pycbc_trigger['end_time'])
        pycbc_trigger = pycbc_trigger.assign(duration=pycbc_trigger['template_duration'] * extension_duration)
        pycbc_trigger = pycbc_trigger.assign(ifo=[IFO] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(id=['TP{}'.format(i) for i in pycbc_trigger['template_id']])
        pycbc_trigger = pycbc_trigger.assign(label=['arbitrary'] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(imgUrl=[None] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(confidence=[0] * pycbc_trigger.shape[0])
        return pycbc_trigger

[docs]    def pycbc_clustering_timeslice(self, pycbc_trigger, IFO, startT, endT, window, extension_duration):
        '''
        description:
            This is a clustering filter
            1. take a frame of pycbc triggers
            2. pick a trigger with highest SNR in a window that is a time-sliced bin
            3. create new columns required to run this code
        USAGE: pycbc_trigger = pycbc_clustering_timeslice(pycbc_trigger, IFO, startT, endT, window=0.1, extension_duration=1.5)

        :param pycbc_trigger: pycbc triggers in pandas frame
        :param IFO: ifo
        :param startT: start time of an epoch
        :param endT: end time of an epoch
        :param window: a window length in sec for clustering
        :param extension_duration: factor for extending the template duration, e.g., extension_duration = 1.5 makes the duration of the on-source window 1.5 times longer than that of the trigger
        :return: pycbc_trigger: clustered pycbc triggers
        '''
        pycbc_trigger = pycbc_trigger.sort_values(by=['end_time'])
        pycbc_trigger = pycbc_trigger.reset_index(drop=True)
        time_bins = np.arange(startT, endT, window)
        for start_bin in time_bins:

            df_overlapped = pycbc_trigger[(pycbc_trigger.end_time >= start_bin) & (pycbc_trigger.end_time < start_bin + window)]
            if df_overlapped.shape[0] != 0:
                max_snr = df_overlapped['snr'].max()  # a trigger with a maximum trigger during this window
                event_ind = df_overlapped[df_overlapped['snr'] == max_snr]['end_time'].index[0]  # index for the trigger with the maximum SNR
                all_index = np.array(df_overlapped.index.tolist())  # a list of index of the overlapping triggers
                redundant_indecies = all_index[all_index != event_ind]  # a list of index of the triggers that are redundant
                pycbc_trigger = pycbc_trigger.drop(redundant_indecies, axis=0)  # drop the redundant triggers
            else:
                continue
        # get the meta data
        pycbc_trigger = pycbc_trigger.assign(GPStime=pycbc_trigger['end_time'])
        pycbc_trigger = pycbc_trigger.assign(duration=pycbc_trigger['template_duration'] * extension_duration)
        pycbc_trigger = pycbc_trigger.assign(ifo=[IFO] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(id=['TP{}'.format(i) for i in pycbc_trigger['template_id']])
        pycbc_trigger = pycbc_trigger.assign(label=['arbitrary'] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(imgUrl=[None] * pycbc_trigger.shape[0])
        pycbc_trigger = pycbc_trigger.assign(confidence=[0] * pycbc_trigger.shape[0])
        return pycbc_trigger


[docs]    def GetTriggerMetadata(self, ListSegments, IFO, output_dir, number_process, trigger_pipeline='omicron', output_file='TriggerMetadata.csv', channel='GDS-CALIB_STRAIN'):
        '''
        description:
            1. take the path of files storing omicron triggers during a segment
            2. make a metadata in pandas frame
            3. save it as a .csv file
            Note: this function works samely as CombinedIndentifyingProcess() but it is faster. And it is supposed to support pyCBC trigger as well.
        USAGE: GetTriggerMetadata(ListSegments, IFO, output_dir, number_process, trigger_pipeline='omicron', output_file='TriggerMetadata.csv', channel='GDS-CALIB_STRAIN')

        :param ListSegments: (list of lists) [[s1, e1], [s2, e2], ...], this is because I want to exclude segments of non-observing times
        :param IFO: interferometer (L1, or H1)
        :param output_dir: an output directory
        :param output_file: a name of an output file
        :param number_process: a number maximum processes in parallel
        :param trigger_pipeline: trigger method 'omicron', 'pycbc-live'
        :param channel: (str) the name of a channel
        :return: ListXML: a list of trigger XML files
        '''
        if trigger_pipeline == 'omicron':
            ListTable = []
            for ObservingSegment in ListSegments: # iterate through a list of segments
                startT, endT = ObservingSegment[0], ObservingSegment[1]
                # get trigger files
                try:
                    cache = find_trigger_files('{0}:{1}'.format(IFO, channel), trigger_pipeline, startT, endT)
                except ValueError as e:
                    print("******************************")
                    print("You might be using inappropriate cluster. Please use a LLO/LHO cluster for L1/H1 omicron trigger")
                    print("Terminate this program.")
                    print("The original error message was")
                    print("==============================")
                    print(e.message)
                    sys.exit()
                # convert data to a table
                try:
                    t = EventTable.read(cache, format='ligolw', tablename='sngl_burst')
                    df = t.to_pandas()  # convert it to a pandas table
                    ListTable.append(df)
                except ValueError:
                    # convert each trigger file to a table with multi processing
                    with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                        for df_indiv in executor.map(Multiprocess_ConvertToTable, cache, itrtls.repeat(trigger_pipeline), itrtls.repeat(IFO), itrtls.repeat(None)):
                            ListTable.append(df_indiv)
                except IndexError: # IndexError: cannot read EventTable from empty source list
                    pass
            FullTriggerSet = pd.concat(ListTable) # concatenate all the tables
            df_new = pd.DataFrame()
            # get the meta data
            df_new = df_new.assign(GPStime=FullTriggerSet['peak_time']+FullTriggerSet['peak_time_ns']*10**(-9))
            df_new = df_new.assign(startGPStime=FullTriggerSet['start_time'] + FullTriggerSet['start_time_ns'] * 10 ** (-9))
            df_new = df_new.assign(peakFreq=FullTriggerSet['peak_frequency'])
            df_new = df_new.assign(snr=FullTriggerSet['snr'])
            df_new = df_new.assign(amplitude=FullTriggerSet['amplitude'])
            df_new = df_new.assign(centralFreq=FullTriggerSet['central_freq'])
            df_new = df_new.assign(duration=FullTriggerSet['duration'])
            df_new = df_new.assign(bandwidth=FullTriggerSet['bandwidth'])
            df_new = df_new.assign(chisq=FullTriggerSet['chisq'])
            df_new = df_new.assign(chisqDof=FullTriggerSet['chisq_dof'])
            df_new = df_new.assign(confidence=FullTriggerSet['confidence'])
            df_new = df_new.assign(id=[None]*FullTriggerSet.shape[0])
            df_new = df_new.assign(ifo=FullTriggerSet['ifo'])
            df_new = df_new.assign(label=['arbitrary'] * FullTriggerSet.shape[0])
            df_new = df_new.assign(imgUrl=[None] * FullTriggerSet.shape[0])
            print('creating meta data matrix')
            output_path = os.path.join(output_dir, output_file)
            # save it as .csv file
            df_new.to_csv(output_path)
            print('Done!')
        elif trigger_pipeline == 'pycbc-live':
            ListTable = []
            for ObservingSegment in ListSegments:  # iterate through a list of segments
                startT, endT = ObservingSegment[0], ObservingSegment[1]
                # get trigger files
                cache = find_trigger_files('{0}:{1}'.format(IFO, channel), trigger_pipeline, startT, endT)
                # trim "file://"
                for index in range(len(cache)):
                    cache[index] = cache[index].split('file://')[1]
                cache = filter_pycbc_live_files(cache) # filter out empty files
                # metadata list
                #Columns = ['chisq', 'chisq_dof', 'coa_phase', 'end_time', 'f_lower', 'inclination',
                #            'mass1', 'mass2', 'sigmasq', 'snr', 'spin1', 'spin1y', 'spin1z', 'spin2x', 'spin2y',
                #            'spin2z', 'template_duration', 'template_hash', 'template_id']
                Columns = ['chisq', 'chisq_dof', 'coa_phase', 'end_time', 'f_lower',
                            'mass1', 'mass2', 'sigmasq', 'snr','template_duration', 'template_hash', 'template_id']
                #Columns = ['end_time', 'snr', 'template_duration', 'template_id']
                # convert each trigger files to a table with multi processing
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for df_indiv in executor.map(Multiprocess_ConvertToTable, cache, itrtls.repeat(trigger_pipeline), itrtls.repeat(IFO), itrtls.repeat(Columns)):
                        ListTable.append(df_indiv) # fill a pandas table to a list
            FullTriggerSet = pd.concat(ListTable) # concatenate all the tables
            # save all the triggers
            output_path_full = os.path.join(output_dir, 'Fulltrigger.csv')
            FullTriggerSet.to_csv(output_path_full)
            print('# of triggers {}'.format(FullTriggerSet.shape[0]))
            del ListTable
            ########################
            # spacial treatment (pick up samples for analysis)
            # take triggers with SNR > 7.5 and template duration < 4 to avoid Gaussian noise and long varying noise
            df_new = self.pyCBC_template_duration_filter(FullTriggerSet, low_cutoff_duration=0, high_cutoff_duration=4)
            df_new = self.pyCBC_SNR_filter(df_new, SNR_low_cut=7.5, SNR_high_cut=150)
            #df_new = self.pyCBC_query_outlier(df_new, BinNum=50, Nsigma=1, cut='median') # outlier filter
            # clustering
            #df_new = self.pycbc_clustering_window_around_trigger(df_new, IFO, one_sided_window=0.1, extension_duration=1.5)
            df_new = self.pycbc_clustering_timeslice(df_new, IFO, startT, endT, window=0.1,  extension_duration=1.)
            print('creating meta data matrix')
            output_path = os.path.join(output_dir, output_file)
            # save it as .csv file
            df_new.to_csv(output_path)
            print('Done!')
        return None
        #return df_loud, df_quiet



[docs]    def FindGlitchNearestGPS(self, PathMetadataFile, candidate_GPS):
        '''
        description:
            1. take a omicron trigger meta data file
            2. find a glitch nearest to an input GPS, which you are concerned about
            3. replace the label of this glitch from 'arbitrary' to 'candidate'
            This function is assumed to be used in DQR.
            Once GraceDB provides a GPS time, this function labels a glitch nearest glitch given by Omicron Trigger as it can be considered as the most significant candidate of astronomical event.
            In this way, we can study only the candidate glitch by specifying glitch_type = candidate in a configuration file
        USAGE: largetSNR = FindGlitchNearestGPS(PathMetadataFile, GPS)

        :param PathMetadataFile: a path to omicron meta data file
        :param GPS: a GPS time to be concerned
        :return: None
        '''
        df = pd.read_csv(PathMetadataFile)
        start = candidate_GPS - 0.5 # start time of the target window, which is 0.5 sec before a candidate event
        end = candidate_GPS + 0.5  # end time of the target window, which is 0.5 sec afte ra candidate event
        # omicron triggers within the time window
        df_window = df[(df['GPStime'] > start) & (df['GPStime'] < end)]
        largetSNR = df_window['snr'].max()
        largestglitchIndex = df_window[df_window['snr'] == largetSNR].index[0]
        # replace the label of this glitch from 'arbitrary' to 'candidate'
        df.loc[largestglitchIndex, 'label'] = 'candidate'
        df.to_csv(PathMetadataFile)
        return largetSNR





#+++++++++++++++++++++++++++++++++++++
# find the allowed segments
#++++++++++++++++++++++++++++++++++++++

def _background_is_clean(reserved_start, reserved_end, bg_start, bg_end):
    '''
    description:
        check that a background segment does not collide with any reserved segment
        A collision is one of the following cases (all the comparisons are strict, so a reserved segment
        that only touches an edge of the background is not a collision)
            case 1: the right edge of a reserved segment is inside of the background
            case 2: the left edge of a reserved segment is inside of the background
            case 3: a reserved segment is inside of the background
            case 4: the background is inside of a reserved segment

    :param reserved_start: an array of starting times of the reserved segments
    :param reserved_end: an array of ending times of the reserved segments
    :param bg_start: starting time of the background segment
    :param bg_end: ending time of the background segment
    :return: True if none of the cases occurs for any reserved segment, otherwise False
    '''
    right_edge_inside = (reserved_end > bg_start) & (reserved_end < bg_end)  # case 1
    left_edge_inside = (reserved_start > bg_start) & (reserved_start < bg_end)  # case 2
    reserved_inside = (reserved_start > bg_start) & (reserved_end < bg_end)  # case 3
    background_inside = (reserved_start < bg_start) & (reserved_end > bg_end)  # case 4
    return not (right_edge_inside | left_edge_inside | reserved_inside | background_inside).any()


def Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. take the triggers of a meta data table that are inside of the given epochs
        2. replace the durations with 0.9 * duration (rounded to 3 decimals), or with UserDefinedDuration for the target class
        3. reserve [GPStime - position_duration_bfr_centr * duration - gap, GPStime + (1 - position_duration_bfr_centr) * duration + gap] for every trigger with snr >= BGSNR_thre
        4. get the subset of the target glitches based on the label, IFO, confidence, SNR, duration and (if the column 'peakFreq' exists) peak frequency thresholds
        5. accept target glitches where their background segments do not coincide with any reserved segment
        6. return the info of the accepted glitches
    USAGE: Listsegments = Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr, flag=flag)

    :param df: meta data in pandas format with the columns 'GPStime', 'duration', 'snr', 'confidence', 'label', 'ifo', 'id' and optionally 'peakFreq'
    :param Epoch_lt: a list of epochs, each element is (start time, end time), both ends are included
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: triggers with SNR below this value are dropped, i.e., they are considered quiet enough to be a part of the background (float or int)
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a lower threshold of confidence level (float or int )
    :param UserDefinedDuration: user defined duration of a target glitch (float or int), it is used only if it is greater than 0
    :param UpperDurationThresh: an upper bound of duration in sec (float or int), applied after the durations are replaced
    :param LowerDurationThresh: a lower bound of duration in sec (float or int), applied after the durations are replaced
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of target glitches, ignored if there is no column 'peakFreq' (pyCBC mode)
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of target glitches, ignored if there is no column 'peakFreq' (pyCBC mode)
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of target glitches, applied in every mode
    :param flag: 'Both' accepts a glitch only if both the preceding and the following backgrounds are clean, 'Either' accepts it if at least one of them is clean, any other value accepts nothing
    :return: a list of tuples, one per accepted glitch, each of which consists of
            the positional index of the glitch within the selected target glitches
            target glitch starting time
            target glitch ending time
            preceding background starting time (0 if it is not available with flag = 'Either')
            preceding background ending time (0 if it is not available with flag = 'Either')
            following background starting time (0 if it is not available with flag = 'Either')
            following background ending time (0 if it is not available with flag = 'Either')
            GPS time
            duration
            SNR
            confidence level
            ID
    '''

    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])

    #### improvement for duration 2019-04-05 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    FractionEnergy = 0.9 # (90% of energy )
    df_int = df_int.assign(gap=[gap]*df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90

    if UserDefinedDuration > 0: # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration

    # make the time segments of glitches measured
    # duration +/- gap around a GPS time of a glitch is reserved
    df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])

    # exclude glitches with SNR < BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    # data set containing only a target glitch class
    # the cuts below are common to every trigger generator, including the upper SNR cut
    selection = (
        (df_int['label'] == TargetGlitchClass)
        & (df_int['ifo'] == IFO)
        & (df_int['confidence'] >= Confidence_thre)
        & (df_int['snr'] >= targetSNR_thre)
        & (df_int['snr'] <= targetUpperSNR_thre)
        & (df_int['duration'] >= LowerDurationThresh)
        & (df_int['duration'] <= UpperDurationThresh)
    )
    if 'peakFreq' in df_int.columns: # gravity spy and omicron trigger mode, pyCBC mode has no column "peakFreq"
        selection = selection & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff)
    df_glitch = df_int[selection]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # reserved segments of all the remaining triggers, taken once as they do not change in the loop
    reserved_start = df_int['reserved_time_start'].values
    reserved_end = df_int['reserved_time_end'].values

    # the list of values satisfied as secure
    Listsegments = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
        duration = df_glitch['duration'].iloc[ind] # a duration
        SNR = df_glitch['snr'].iloc[ind] # snr
        confi = df_glitch['confidence'].iloc[ind] # confidence level
        ID = df_glitch['id'].iloc[ind] # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # take the background around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end
        background_duration = duration # duration of both the preceding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        # security check of the preceding and the following backgrounds
        pre_available = _background_is_clean(reserved_start, reserved_end, pre_background_start, pre_background_end)
        fol_available = _background_is_clean(reserved_start, reserved_end, fol_background_start, fol_background_end)

        if flag == 'Both': # take both preceding and following backgrounds
            if not (pre_available and fol_available):
                continue
        elif flag == 'Either': # accept if either a preceding or a following background is available
            if not (pre_available or fol_available):
                print('Both a preceding and following are NOT available')
                continue
            if not pre_available: # the case where only a following is available
                pre_background_start, pre_background_end = 0, 0 # fill zero
            if not fol_available: # the case where only a preceding is available
                fol_background_start, fol_background_end = 0, 0 # fill zero
        else: # any other flag accepts nothing
            continue

        Listsegments.append((ind, target_timeseries_start, target_timeseries_end, pre_background_start,
                             pre_background_end, fol_background_start, fol_background_end, gpstime, duration,
                             SNR, confi, ID))

    return Listsegments


#=======================================


[docs]def FindglitchlistLongestBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):
    '''
    description:
        1. load Gravity Spy data set (.csv file)
        2. get the data about the target glitch class
        3. get the subset of the target glitches based on SNR and confidence level threshold a user defines
        4. accept glitches where their back ground segment do not coincide with any other glitches
        5. return the info of the accepted glitches
    USAGE: Listsegments = FindglitchlistOnLineMode(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, flag)

    :param df: GravitySpy meta data in pandas format
    :param Epochstart: starting time of an epoch
    :param Epochend:  end time of an epoch
    :param Commissioning_lt: commissioning time in list
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
    :param gap: a time gap between the target and the background segments in sec, 1 sec in default
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers given by an ETG  for target glitches
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers given by an ETG queries for target glitches
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers given by an ETG queries for target glitches
    :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
    :return: the list of parameters of glitches passing the above thresholds
        Listsegments contains of
            ListIndexSatisfied: a list of index of glitches
            Listtarget_timeseries_start: a list of a target glitch starting time
            Listtarget_timeseries_end: a list of a target glitch ending time
            Listpre_background_start: a list of preceding background starting time
            Listpre_background_end; a list of preceding background ending time
            Listfol_background_start: a list of following background starting time
            Listfol_background_end: a list of following background ending time
            Listgpstime: a list of GPS times
            Listduration: a list of durations
            ListSNR: a list of SNRs
            Listconfi: a list of confidence levels
            ListID: a list of IDs
    '''


    df_list = []
    # take the data in an Epoch
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
        df_list.append(df_epoch)
    df_int = pd.concat(df_list)


    #### improvement for duration 2019-04-05 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
    FractionEnergy = 1. # (90% of energy )
    # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
    # df_int = df_int.assign(gap = gap + (1 - FractionEnergy)/2.*df_int['duration'])
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90


    # make the target segments
    # I reserve duration +/- gap around a GPS time of a glitch

    if UserDefinedDuration > 0: # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration
    else:
        pass

    df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])

    # reserving_time does not include gap
    #df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'])
    #df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'])

    # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre] # I think this is ok 2018-12-15


    # data set containing only a target glitch class
    try: # gravity spy and omicron trigger mode
        # for cWB triggers
        #df_glitch = df_int[(df_int.rho >= 10) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff)]
        df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    except KeyError: # pyCBC mode as there is no column "peakFreq"
        # df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
        # add central frequency cuts
        df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]

    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))


    # the list of values satisfied as secure
    ListIndexSatisfied = []
    Listtarget_timeseries_start = []
    Listtarget_timeseries_end = []
    Listpre_background_start = []
    Listpre_background_end = []
    Listfol_background_start = []
    Listfol_background_end = []
    Listgpstime = []
    Listduration = []
    ListSNR = []
    Listconfi = []
    ListID = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
        duration = df_glitch['duration'].iloc[ind] # # a duration
        SNR = df_glitch['snr'].iloc[ind] # snr
        confi = df_glitch['confidence'].iloc[ind] # confidence level
        ID = df_glitch['id'].iloc[ind] # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
            pass # go to the next line
        else: # if the duration of a glitch is less than 0.02 sec
            continue # go to the next iteration



        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # take the background
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end

        background_duration = duration # duration of both the proceeding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration #target_timeseries_start - gap_individual
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration #target_timeseries_end + gap_individual + background_duration

        # security check of the preceding background
        # case 1 is where the background is coincident with the right edge of the reserved segment
        # case 2 is where the background is coincident with the left edge of the reserved segment
        # case 3 is where the reserved segment is inside of the background
        # case 4 is where the background is inside of the reserved segment
        # it violates the security of the background estimation if at least one of the cases occurs
        # i.e., pro_case1, 2, 3, and 4 should be zero

        pre_case1 = len(df_int[(df_int['reserved_time_end'] > pre_background_start) & (df_int['reserved_time_end'] < pre_background_end)]) # case 1
        pre_case2 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (df_int['reserved_time_start'] < pre_background_end)]) # case 2
        pre_case3 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (df_int['reserved_time_end'] < pre_background_end)]) # case 3
        pre_case4 = len(df_int[(df_int['reserved_time_start'] < pre_background_start) & (df_int['reserved_time_end'] > pre_background_end)])  # case 4

        # security check of the following background
        fol_case1 = len(df_int[(df_int['reserved_time_end'] > fol_background_start) & (df_int['reserved_time_end'] < fol_background_end)]) # case 1
        fol_case2 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (df_int['reserved_time_start'] < fol_background_end)]) # case 2
        fol_case3 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (df_int['reserved_time_end'] < fol_background_end)]) # case 3
        fol_case4 = len(df_int[(df_int['reserved_time_start'] < fol_background_start) & (df_int['reserved_time_end'] > fol_background_end)])  # case 4



        if flag == 'Both': # take both preceding and following back gounds
             if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0 and fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0:
                try:
                    # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                    neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                    if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                        if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                            neareset_gap_pre = 10 * duration  # replace it with the 10 time of the duration
                        else:
                            pass
                        if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                            pre_background_start = pre_background_end - neareset_gap_pre  # replace
                        else:
                            pass
                except IndexError: # there is no nearest glitch that satisfies the above condition
                    pass

                try:
                    # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                    neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                    if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                        if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                            neareset_gap_fol = 10 * duration  # replace it with the 10 time of the duration
                        else:
                            pass
                        if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                            fol_background_end = fol_background_start + neareset_gap_fol  # replace
                        else:
                            pass
                except IndexError: # there is no nearest glitch that satisfies the above condition
                    pass


                # #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration))
                print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
                if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                    sys.exit()

                ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                Listtarget_timeseries_start.append(target_timeseries_start)
                Listtarget_timeseries_end.append(target_timeseries_end)
                Listpre_background_start.append(pre_background_start)
                Listpre_background_end.append(pre_background_end)
                Listfol_background_start.append(fol_background_start)
                Listfol_background_end.append(fol_background_end)
                Listgpstime.append(gpstime)
                Listduration.append(duration)
                ListSNR.append(SNR)
                Listconfi.append(confi)
                ListID.append(ID)
             else:
                 continue
        if flag == 'Either': # accept if either a preceding or a following background is available
            if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0:
                if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0: # the case where both a proceeding and a following are available
                    try:
                        # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                        neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                        if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_pre = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                pre_background_start = pre_background_end - neareset_gap_pre  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass
                    try:
                        # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                        neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                        if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_fol = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                fol_background_end = fol_background_start + neareset_gap_fol  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass

                    print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
                    if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                        sys.exit()

                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(pre_background_start)
                    Listpre_background_end.append(pre_background_end)
                    Listfol_background_start.append(fol_background_start)
                    Listfol_background_end.append(fol_background_end)
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
                else: # the case where only a preceding is available
                    try:
                        # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                        neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                        if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_pre = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                pre_background_start = pre_background_end - neareset_gap_pre  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass

                    print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, 0, duration))
                    if round(pre_background_end - pre_background_start, 3) < round(duration, 3) :
                        sys.exit()

                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(pre_background_start)
                    Listpre_background_end.append(pre_background_end)
                    Listfol_background_start.append(0) # fill zero
                    Listfol_background_end.append(0) # fill zero
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
            else:
                if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0: # the case where only a following is available
                    try:
                        # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                        neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                        if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_fol = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                fol_background_end = fol_background_start + neareset_gap_fol  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass

                    print('{:.1f}, {:.1f}, {:.1f}'.format(0, fol_background_end - fol_background_start, duration))
                    if round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                        sys.exit()

                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(0) # fill zero
                    Listpre_background_end.append(0) # fill zero
                    Listfol_background_start.append(fol_background_start)
                    Listfol_background_end.append(fol_background_end)
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
                else: # the case where both are NOT available
                    print('Both a preceding and following are NOT available')
                    continue

    # zipped list of segments
    Listsegments = list(
        zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
            Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
            ListSNR, Listconfi, ListID))
    return Listsegments




def Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):
    '''
    description:
        1. restrict the trigger table to the epochs listed in Epoch_lt
        2. drop the triggers whose SNR is below BGSNR_thre
        3. select the triggers of the target glitch class in the given IFO that pass the confidence, SNR and
           duration cuts (and the peak-frequency cuts when the column "peakFreq" exists)
        4. for every selected trigger with a duration longer than 0.02 sec, lay out the target segment and a
           preceding and a following background segment
        5. return the parameters of these segments
        Note: this function does NOT check whether a background segment coincides with another glitch;
        every selected glitch with a duration longer than 0.02 sec is returned.
    USAGE: Listsegments = Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):

    :param df: trigger meta data in pandas format; the columns GPStime, duration, snr, confidence, id, ifo and label are read, peakFreq is optional
    :param Epoch_lt: a list of epochs, each of which is a (start time, end time) pair; both ends are inclusive. An empty list gives an empty result
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: triggers with SNR below this value are discarded before the target selection, float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a lower threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: if greater than 0, this value replaces the duration of the target glitches (same IFO and class) before the duration cuts are applied
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers for target glitches (ignored if there is no column "peakFreq")
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers for target glitches (ignored if there is no column "peakFreq")
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers for target glitches
    :param flag: not used by this function; kept so that the signature is parallel to the other glitch-list finders
    :return: Listsegments, a list with one tuple per accepted glitch. Each tuple contains, in this order,
            the position of the glitch among the selected target glitches (glitches skipped for a too short duration still count)
            target glitch starting time
            target glitch ending time
            preceding background starting time
            preceding background ending time
            following background starting time
            following background ending time
            GPS time
            duration
            SNR
            confidence level
            ID
    '''

    # take the data in the epochs; an empty list of epochs gives an empty table instead of a pd.concat error
    df_list = [df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt]
    df_int = pd.concat(df_list) if df_list else df.iloc[0:0]

    # FractionEnergy scales every duration. It is 1 at present, so the full durations are kept and
    # only rounded to 3 decimals; the same gap is used for every trigger.
    FractionEnergy = 1.
    df_int = df_int.assign(gap=gap)
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)

    if UserDefinedDuration > 0:  # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), 'duration'] = UserDefinedDuration

    # keep the triggers with SNR >= BGSNR_thre; quieter triggers are considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    # data set containing only a target glitch class
    selection = (
        (df_int['label'] == TargetGlitchClass)
        & (df_int['ifo'] == IFO)
        & (df_int['confidence'] >= Confidence_thre)
        & (df_int['snr'] >= targetSNR_thre)
        & (df_int['snr'] <= targetUpperSNR_thre)
        & (df_int['duration'] >= LowerDurationThresh)
        & (df_int['duration'] <= UpperDurationThresh)
    )
    if 'peakFreq' in df_int.columns:  # gravity spy and omicron trigger mode; pyCBC has no column "peakFreq"
        selection &= (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff)
    df_glitch = df_int[selection]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    Listsegments = []
    if df_glitch.empty:  # nothing selected
        return Listsegments

    # iterate over the raw column arrays so that the elements keep their numpy scalar types
    column_names = ('GPStime', 'duration', 'snr', 'confidence', 'id', 'gap')
    rows = zip(*(df_glitch[name].values for name in column_names))
    for ind, (gpstime, duration, SNR, confi, ID, gap_individual) in enumerate(rows):

        if not duration > 0.02:  # skip a glitch whose duration is not greater than 0.02 sec (NaN is skipped as well)
            continue

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # both backgrounds last 15 times the duration, capped at 15 sec to reduce computational cost
        background_duration = 15 * duration
        if background_duration > 15:
            background_duration = 15
        # preceding background ends one gap before the target, following background starts one gap after it
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
        Listsegments.append(
            (ind, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
             fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID))

    return Listsegments


def FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. within an epoch, create a list of synthetic points randomly chosen on a time grid, with durations following the distribution of those of a target glitch class
        2. make a pandas frame dataset, save it as RandomSample.csv and return the data set read back from this file
    USAGE: df = FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}. For nominal-lock, the first 300 sec and the last 60 sec of every segment are removed
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs; the start of the first one and the end of the last one are used to query the segments
    :param number_samples: number of samples picked up; it is reduced if the segments cannot host that many grid points
    :param step: step of data points in sec
    :param outputMother_dir: an output directory passed to IdentifyGlitch.FindObservingTimeSegments
    :param df_target: true glitch samples; only the column "duration" is used
    :return: df: synthetic random data points with the columns of the standard trigger format, with label 'null' and zero SNR
             (as the data set is read back from the csv file, it also carries the saved index as an additional column)
    :raises ValueError: if df_target has no duration greater than 0.02 sec
    '''
    # -----------------------------------------------------------------------------------------------------+
    # get a list of start and end GPS times of an epoch with a given state {'observing', 'nominal-lock'}   |
    # use IdentifyGlitch.FindObservingTimeSegments()                                                       |
    # -----------------------------------------------------------------------------------------------------+
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get the segments; SegmentsMat is used as a 2-D array of (start, end) rows
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the misspelled 'nominal-lcok' is still accepted for callers that relied on the former spelling
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:,0] = SegmentsMat[:, 0] + 300 # remove 5 min after the segment begins
        SegmentsMat[:,1] = SegmentsMat[:, 1] - 60 # remove 1 min before the segment ends
    # drop segments without a positive length (e.g. shorter than 6 min before the trimming)
    SegmentsMat = SegmentsMat[SegmentsMat[:, 1] > SegmentsMat[:, 0]]

    total_seg_used = (SegmentsMat[:,1] - SegmentsMat[:,0]).sum()
    max_number_samples = int(total_seg_used/float(step)) # the maximum number of possible samples
    if number_samples > max_number_samples:
        number_samples = max_number_samples
        print('User selected number of synthetic samples is greater than the maximum number of possible samples')
        print('Set the number of samples equal to the maximum number = {}'.format(number_samples))
    #-----------------------------------------------------+
    # generate synthetic list of center times of samples  |
    #-----------------------------------------------------+
    chunks_center_times = [np.array([])] # the empty float array keeps the concatenated result in float
    for start_seg_used, end_seg_used in SegmentsMat:
        length_seg_used = end_seg_used - start_seg_used # length of a segment in sec
        number_grid_points = int(length_seg_used/float(step)) # number of grid points in this segment
        if number_grid_points <= 0: # the segment is shorter than a step
            continue
        # share of the request for this segment, proportional to its length
        number_picked = int(number_samples * length_seg_used / float(total_seg_used))
        # pick one more than the share as the truncation above can make the total smaller than the request,
        # but never more than the number of grid points because the points are drawn without duplication
        number_drawn = min(number_picked + 1, number_grid_points)
        selected_index_center_times = np.random.choice(number_grid_points, number_drawn, replace=False)
        chunks_center_times.append(np.sort(start_seg_used + step * selected_index_center_times))
    whole_list_center_times = np.concatenate(chunks_center_times)

    if number_samples > whole_list_center_times.size: # fewer points than the request could be drawn
        number_samples = whole_list_center_times.size
        print('Only {} synthetic samples could be drawn. Set the number of samples equal to this number'.format(number_samples))
    if whole_list_center_times.size > 0:
        # randomly choose the timestamps where the size of the sample equal to the request, in ascending order
        whole_list_center_times = np.sort(np.random.choice(whole_list_center_times, number_samples, replace=False))
    #--------------------------------------+
    # generate synthetic list of durations |
    #--------------------------------------+
    duration_min = 0.02 # only durations greater than 0.02 s enter the distribution
    duration_bin_width = 0.01 # the requested bin width of duration
    durations = df_target.duration[df_target.duration > duration_min]
    if durations.empty:
        raise ValueError('df_target has no duration greater than {} sec to build the distribution of durations'.format(duration_min))
    # the number of bins such that the bin width is about 0.01, at least one bin
    Bins = max(int((durations.max() - duration_min) / duration_bin_width), 1)
    # make a normalized histogram of durations
    Count, Edge = np.histogram(durations, bins=Bins, density=True)
    d_Edge = Edge[1] - Edge[0] # bin width
    Center_bin = Edge[:-1] + d_Edge * 0.5 # center values of each bin
    # probability of each bin, renormalized so that the sum is 1 up to rounding as np.random.choice requires
    probability = Count * d_Edge
    probability = probability / probability.sum()
    # generate a synthetic set of durations from the distribution of the real durations
    gen_d = np.random.choice(Center_bin, number_samples, p=probability)
    # make a pandas dataframe that is compatible with the standard trigger format
    zeros = np.zeros(number_samples)
    df = pd.DataFrame()
    df = df.assign(GPStime=whole_list_center_times)
    df = df.assign(startGPStime=zeros, peakFreq=zeros, snr=zeros, amplitude=zeros, centralFreq=zeros)
    df = df.assign(duration=gen_d)
    df = df.assign(bandwidth=zeros, chisq=zeros, chisqDof=zeros, confidence=zeros)
    df = df.assign(id=[None] * number_samples)
    df = df.assign(ifo=['{}'.format(IFO)] * number_samples)
    df = df.assign(label=['null'] * number_samples)
    df = df.assign(imgUrl=[None] * number_samples)
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path) # save the data sample
    df = pd.read_csv(output_path) # load the file
    df.loc[:, 'label'] = 'null' # read_csv parses the text 'null' as NaN, so restore the label
    return df



def FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. within an epoch, create a list of synthetic points by shifting the trigger times of a target glitch class by 10 sec
        2. keep the shifted points that fall in the segments of the given state
        3. make a pandas frame dataset, save it as RandomSample.csv and return the data set read back from this file
    USAGE: df = FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}. For nominal-lock, the first 300 sec and the last 60 sec of every segment are removed
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs; the start of the first one and the end of the last one are used to query the segments
    :param number_samples: not used; kept so that the signature is parallel to FindRadomlistPoints
    :param step: not used; kept so that the signature is parallel to FindRadomlistPoints
    :param outputMother_dir: an output directory passed to IdentifyGlitch.FindObservingTimeSegments
    :param df_target: true glitch samples; the input data set itself is not modified
    :return: df: the shifted copies of the target glitches inside the segments, with label 'null', zero SNR and empty id and imgUrl
             (as the data set is read back from the csv file, it also carries the saved index as an additional column)
    '''
    # -----------------------------------------------------------------------------------------------------+
    # get a list of start and end GPS times of an epoch with a given state {'observing', 'nominal-lock'}   |
    # use IdentifyGlitch.FindObservingTimeSegments()                                                       |
    # -----------------------------------------------------------------------------------------------------+
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get the segments; SegmentsMat is used as a 2-D array of (start, end) rows
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the misspelled 'nominal-lcok' is still accepted for callers that relied on the former spelling
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:,0] = SegmentsMat[:, 0] + 300 # remove 5 min after the segment begins
        SegmentsMat[:,1] = SegmentsMat[:, 1] - 60 # remove 1 min before the segment ends

    number_rows = df_target.shape[0]
    df_target = df_target.assign(
        GPStime=df_target.GPStime.values + 10, # intentionally shift the trigger times by 10 s
        snr=[0] * number_rows,
        label=['null'] * number_rows,
        id=[None] * number_rows,
        imgUrl=[None] * number_rows,
    )
    # shifted points inside each segment (both ends inclusive); a segment that became empty by the trimming selects nothing
    df_list = [df_target[(df_target.GPStime >= Seg_start) & (df_target.GPStime <= Seg_end)] for Seg_start, Seg_end in SegmentsMat]
    # without any segment, return an empty data set instead of failing in pd.concat
    df = pd.concat(df_list) if df_list else df_target.iloc[0:0]
    df = df.reset_index(drop=True)
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path) # save the data sample
    df = pd.read_csv(output_path) # load the file
    df.loc[:, 'label'] = 'null' # read_csv parses the text 'null' as NaN, so restore the label as FindRadomlistPoints does
    return df



[docs]def FindBGlist(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. load Gravity Spy data set (.csv file)
        2. get the data about the target glitch class
        3. get the subset of the target glitches based on SNR and confidence level threshold a user defines
        4. accept glitches where their back ground segment do not coincide with any other glitches
        5. return the info of the accepted glitches
    USAGE: Listsegments = FindglitchlistOnLineMode(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, flag)

    :param df: GravitySpy meta data in pandas format
    :param Epochstart: starting time of an epoch
    :param Epochend:  end time of an epoch
    :param Commissioning_lt: commissioning time in list
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1)  (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int )
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
    :param gap: a time gap between the target and the background segments in sec, 1 sec in default
    :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
    :return: the list of parameters of glitches passing the above thresholds
        Listsegments contains of
            ListIndexSatisfied: a list of index of glitches
            Listtarget_timeseries_start: a list of a target glitch starting time
            Listtarget_timeseries_end: a list of a target glitch ending time
            Listpre_background_start: a list of preceding background starting time
            Listpre_background_end; a list of preceding background ending time
            Listfol_background_start: a list of following background starting time
            Listfol_background_end: a list of following background ending time
            Listgpstime: a list of GPS times
            Listduration: a list of durations
            ListSNR: a list of SNRs
            Listconfi: a list of confidence levels
            ListID: a list of IDs
    '''

    df_list = []
    # take the data in an Epoch
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
        df_list.append(df_epoch)
    df_int = pd.concat(df_list)

    # data set containing only a target glitch class
    try:  # gravity spy and omicron trigger mode
        df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    except KeyError:  # pyCBC mode as there is no column "peakFreq"
        df_target = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
    # generate synthetic data points
    df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step,  outputMother_dir, df_target[df_target.snr > BGSNR_thre])
    #df_random = FindShiftedPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])

    df_int = df_int.append(df_random, ignore_index=True) # append the dataset of random points

    #### improvement for duration 2019-07-02 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    ### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
    ### This function is for getting quite segments so that I concervatively choose the longest fraction of duration, in default
    FractionEnergy = 1  # (100% of energy )
    # note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
    df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)  # replace full durations (T100) with T90
    # make the time segments of glitches measured
    # I reserve duration +/- gap around a GPS time of a glitch

    if UserDefinedDuration > 0:  # accept a user defined duration
        # assign this user defined duration for any types of glitches observed in a given IFO
        df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration
    else:  # otherwise, keep the duration that are determined by an event trigger generator (ETG)
        pass

    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])

    # data set of randomly chosen background
    df_glitch = df_int[df_int.label == 'null']  # synthetic background trigger has SNR of zero


    # exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] > BGSNR_thre]

    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # the list of values satisfied as secure
    ListIndexSatisfied = []
    Listtarget_timeseries_start = []
    Listtarget_timeseries_end = []
    Listpre_background_start = []
    Listpre_background_end = []
    Listfol_background_start = []
    Listfol_background_end = []
    Listgpstime = []
    Listduration = []
    ListSNR = []
    Listconfi = []
    ListID = []
    for ind in range(len(df_glitch)):

        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if duration > 0.02:  # accept the duration of glitch is greater than 0.02 sec
            pass  # go to the next line
        else:  # if the duration of a glitch is less than 0.01 sec
            continue  # go to the next iteration

        # the starting and ending time of the target glitch
        # at this moment I assume that the target glitch is distributed evenly around the GPS time
        target_timeseries_start = gpstime - duration / 2.
        target_timeseries_end = target_timeseries_start + duration

        # take the background
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end

        background_duration = duration  # duration of both the proceeding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration  # target_timeseries_start - gap_individual
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration  # target_timeseries_end + gap_individual + background_duration

        # security check of the preceding background
        # case 1 is where the background is coincident with the right edge of the reserved segment
        # case 2 is where the background is coincident with the left edge of the reserved segment
        # case 3 is where the reserved segment is inside of the background
        # case 4 is where the background is inside of the reserved segment
        # it violates the security of the background estimation if at least one of the cases occurs
        # i.e., pro_case1, 2, 3, and 4 should be zero

        pre_case1 = len(df_int[(df_int['reserved_time_end'] > pre_background_start) & (
                    df_int['reserved_time_end'] < pre_background_end)])  # case 1
        pre_case2 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (
                    df_int['reserved_time_start'] < pre_background_end)])  # case 2
        pre_case3 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (
                    df_int['reserved_time_end'] < pre_background_end)])  # case 3
        pre_case4 = len(df_int[(df_int['reserved_time_start'] < pre_background_start) & (
                    df_int['reserved_time_end'] > pre_background_end)])  # case 4

        # security check of the following background
        fol_case1 = len(df_int[(df_int['reserved_time_end'] > fol_background_start) & (
                    df_int['reserved_time_end'] < fol_background_end)])  # case 1
        fol_case2 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (
                    df_int['reserved_time_start'] < fol_background_end)])  # case 2
        fol_case3 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (
                    df_int['reserved_time_end'] < fol_background_end)])  # case 3
        fol_case4 = len(df_int[(df_int['reserved_time_start'] < fol_background_start) & (
                    df_int['reserved_time_end'] > fol_background_end)])  # case 4

        if flag == 'Both':  # take both preceding and following back grounds
            if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0 and fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0:
                try:
                    # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                    neareset_gap_pre = (pre_background_start -
                                        df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                    if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                        if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                            neareset_gap_pre = 10 * duration  # replace it with the 10 time of the duration
                        else:
                            pass
                        if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                            pre_background_start = pre_background_end - neareset_gap_pre  # replace
                        else:
                            pass
                except IndexError:  # there is no nearest glitch that satisfies the above condition
                    pass

                try:
                    # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                    neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                    if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                        if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                            neareset_gap_fol = 10 * duration  # replace it with the 10 time of the duration
                        else:
                            pass
                        if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                            fol_background_end = fol_background_start + neareset_gap_fol  # replace
                        else:
                            pass
                except IndexError:  # there is no nearest glitch that satisfies the above condition
                    pass

                # #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration))
                print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
                if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                    sys.exit()

                ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                Listtarget_timeseries_start.append(target_timeseries_start)
                Listtarget_timeseries_end.append(target_timeseries_end)
                Listpre_background_start.append(pre_background_start)
                Listpre_background_end.append(pre_background_end)
                Listfol_background_start.append(fol_background_start)
                Listfol_background_end.append(fol_background_end)
                Listgpstime.append(gpstime)
                Listduration.append(duration)
                ListSNR.append(SNR)
                Listconfi.append(confi)
                ListID.append(ID)
            else:
                continue
        if flag == 'Either':  # accept if either a preceding or a following background is available
            if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0:
                if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0:  # the case where both a proceeding and a following are available
                    try:
                        # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                        neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                        if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_pre = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                pre_background_start = pre_background_end - neareset_gap_pre  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass
                    try:
                        # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                        neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                        if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_fol = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                fol_background_end = fol_background_start + neareset_gap_fol  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass
                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(pre_background_start)
                    Listpre_background_end.append(pre_background_end)
                    Listfol_background_start.append(fol_background_start)
                    Listfol_background_end.append(fol_background_end)
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
                else:  # the case where only a proceeding is available
                    try:
                        # smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
                        neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
                        if neareset_gap_pre >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_pre > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_pre = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                pre_background_start = pre_background_end - neareset_gap_pre  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass
                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(pre_background_start)
                    Listpre_background_end.append(pre_background_end)
                    Listfol_background_start.append(0)  # fill zero
                    Listfol_background_end.append(0)  # fill zero
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
            else:
                if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0:  # the case where only a following is available
                    try:
                        # smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
                        neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
                        if neareset_gap_fol >= duration:  # if the smallest gap is larger than the duration
                            if neareset_gap_fol > 10 * duration:  # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
                                neareset_gap_fol = 10 * duration  # replace it with the 10 times of the duration
                            else:
                                pass
                            if duration < 10:  # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
                                fol_background_end = fol_background_start + neareset_gap_fol  # replace
                            else:
                                pass
                    except IndexError:  # there is no nearest glitch that satisfies the above condition
                        pass
                    ListIndexSatisfied.append(ind)  # append the index satisfied as secure
                    Listtarget_timeseries_start.append(target_timeseries_start)
                    Listtarget_timeseries_end.append(target_timeseries_end)
                    Listpre_background_start.append(0)  # fill zero
                    Listpre_background_end.append(0)  # fill zero
                    Listfol_background_start.append(fol_background_start)
                    Listfol_background_end.append(fol_background_end)
                    Listgpstime.append(gpstime)
                    Listduration.append(duration)
                    ListSNR.append(SNR)
                    Listconfi.append(confi)
                    ListID.append(ID)
                else:  # the case where both are NOT available
                    print('Both a preceding and following are NOT available')
                    continue

    # zipped list of segments
    Listsegments = list(
        zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
            Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
            ListSNR, Listconfi, ListID))
    return Listsegments





def TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):
    '''
    description:
        Generate a regular grid of synthetic (time-shifted) samples that only have a preceding background.
        1. take the start of the first epoch and the end of the last epoch in Epoch_lt
        2. create int(end - start) samples, the k-th one being shifted by k sec from the start
        3. each sample is a 10-sec preceding background immediately followed by a 1-sec target segment
        4. return the samples in the same tuple layout as the other segment finders in this file
        No trigger table is queried and no glitch selection is performed here.
    USAGE: Listsegments = TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr)

    :param df: not used in this function
    :param Epoch_lt: a list of epochs, each being a (start, end) pair; only the start of the first epoch and the end of the last epoch are read
    :param TargetGlitchClass: not used in this function
    :param IFO: not used in this function
    :param BGSNR_thre: not used in this function
    :param targetSNR_thre: not used in this function
    :param Confidence_thre: not used in this function
    :param UserDefinedDuration: not used in this function
    :param gap: not used in this function
    :param position_duration_bfr_centr: not used in this function
    :param TriggerPeakFreqLowerCutoff: not used in this function
    :param TriggerPeakFreqUpperCutoff: not used in this function
    :param targetUpperSNR_thre: not used in this function
    :param flag: not used in this function
    :return: Listsegments, a list of tuples (one per sample) made of
            index: sample index k = 0, 1, 2, ...
            target_timeseries_start: start + 10 + k
            target_timeseries_end: target_timeseries_start + 1
            pre_background_start: start + k
            pre_background_end: pre_background_start + 10 (equal to target_timeseries_start)
            fol_background_start: always 0 (no following background)
            fol_background_end: always 0 (no following background)
            gpstime: center of the target segment (target_timeseries_start + 0.5)
            duration: always 1
            SNR: always 10
            confidence: always 0
            ID: always the string 'None'

    NOTE(review): the unused parameters presumably keep the signature interchangeable with the other segment finders -- confirm against callers.
    NOTE(review): gaps between the epochs in Epoch_lt are not excluded, and the last target segments end up to 10 sec after the end of the last epoch -- confirm this is intended.
    '''

    on_source_len = 1    # length of every target segment in sec
    off_source_len = 10  # length of every preceding background in sec

    # overall span covered by the epochs
    epoch_begin = Epoch_lt[0][0]
    epoch_finish = Epoch_lt[-1][-1]
    n_windows = int(epoch_finish - epoch_begin)

    # the very first target segment starts right after the very first background
    first_target_start = epoch_begin + off_source_len

    Listsegments = []
    for k in range(n_windows):
        shift = on_source_len * k  # every sample slides by one target length
        target_start = first_target_start + shift
        bg_start = epoch_begin + shift
        Listsegments.append((
            k,                                   # index
            target_start,                        # target start
            target_start + on_source_len,        # target end
            bg_start,                            # preceding background start
            bg_start + off_source_len,           # preceding background end
            0,                                   # following background start (not available)
            0,                                   # following background end (not available)
            target_start + 0.5 * on_source_len,  # GPS time at the center of the target
            1,                                   # duration
            10,                                  # SNR (fixed dummy value)
            0,                                   # confidence (fixed dummy value)
            'None',                              # ID
        ))
    return Listsegments





#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#take time series of safe channels and calculate the absolute values of whitened FFT
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

def select_some_trials(iterate, maximum_iterate=5):
    '''
    description:
        Limit how many off-source trials are processed.
        If there are more than maximum_iterate trials, maximum_iterate distinct trial indices
        are drawn at random (without replacement); otherwise every trial index is kept.
    USAGE: list_index_trials = select_some_trials(iterate, maximum_iterate=5)

    :param iterate: the number of available trials; indices run from 0 to iterate - 1
    :param maximum_iterate: the maximum number of trials to keep, 5 in default
    :return: a numpy array of trial indices whose length is maximum_iterate or less;
             the indices are in ascending order when all of them are kept and in random order when sub-sampled
    '''
    needs_subsampling = iterate > maximum_iterate
    if not needs_subsampling:
        # few enough trials: keep all of them, 0 .. iterate - 1
        return np.arange(iterate).astype(int)
    # too many trials: pick maximum_iterate different indices at random
    return np.random.choice(range(iterate), maximum_iterate, replace=False)


[docs]def Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments
    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG
    :param pre_background_start: an end time of a preceding BG
    :param pre_background_start: a start time of a following BG
    :param pre_background_start: an end time of a following BG
    :return:    whitened_fft_target: whitened fft of a target segment
                whitened_fft_PBG: whitened fft of a preceding segment
                whitened_fft_FBG: whitened fft of a following segment
                sample_rate: sampling rate of this channel
                DURATION: a duration of a target segment
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = target_timeseries_end - target_timeseries_start
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration to make an error in whitening process not not to occur
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
    elif third_decimal_num_of_duration % 2 == 0:  # even number at the third decimal point
        pass  # keep the original duration as it is
    DURATION = round(DURATION, 3)  # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
    pre_background_duration = pre_background_end - pre_background_start
    fol_background_duration = fol_background_end - pre_background_end
    pre_background_asd_weight = pre_background_duration / (pre_background_duration + fol_background_duration)
    fol_background_asd_weight = fol_background_duration / (pre_background_duration + fol_background_duration)
    full_timeseries_start = full_timeseries.span[0]
    full_timeseries_end = full_timeseries.span[1]
    del pre_background_duration, fol_background_duration
    if pre_background_start != 0 and fol_background_start != 0:  # if both BGs are available
        # retrieve the preceding and following backgrounds
        if pre_background_start < full_timeseries_start: # if the start time of the preceding BG is before that of the full time series
            pre_background = full_timeseries.crop(end=pre_background_end) # only specify the end time of cropping
        else: # otherwise
            pre_background = full_timeseries.crop(start=pre_background_start, end=pre_background_end) # specify both
        if fol_background_end > full_timeseries_end: # if the end time of the following BG is after that of the full time series
            fol_background = full_timeseries.crop(start=fol_background_start) # specify only the stat time of cropping
        else: # otherwise
            fol_background = full_timeseries.crop(start=fol_background_start, end=fol_background_end) # specify both
        # average background ASD
        try:
            pre_background_asd = pre_background.asd(DURATION, method='welch')  # ASD of the preceding BG
            fol_background_asd = fol_background.asd(DURATION, method='welch')  # ASD of the following BG
        except ValueError:
            raise ValueError
        ave_background_asd = pre_background_asd_weight*pre_background_asd + fol_background_asd_weight*fol_background_asd # weighed average of those two BG ASD
        ave_background_asd = ave_background_asd.value # convert numpy array
        del pre_background_asd_weight, fol_background_asd_weight
        # for the preceding BG
        if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
            # number of BG trials
            iterate = int(pre_background.duration.value / DURATION)
            # redefine the start and end times of the preceding BG to avoid cropping warning
            pre_background_start, pre_background_end = pre_background.span[0], pre_background.span[1]
            ## --------cropping from the starting time------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_start_list = [pre_background_start + i * DURATION for i in list_index_trials] # a list of starting times cropped segments
            crop_end_list = [crop_start + DURATION for crop_start in crop_start_list] # a list of ending times of cropped segments
            # avoid the cropping warning
            crop_end_list = [crop_end if crop_end < pre_background_end else pre_background_end for crop_end in crop_end_list] # if end time is replaced with the end of the BG segments if it is larger
            # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            # calculate each whitened FFT
            whitened_fft_PBG_ind_list = [pre_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list # delete redundant lists
            whitened_fft_PBG_forwarding = np.concatenate(whitened_fft_PBG_ind_list, axis=None)
            del whitened_fft_PBG_ind_list # delete redundant a list
            ## -----cropping from the ending time------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_end_list = [pre_background_end - i * DURATION for i in list_index_trials] # a list of ending times of cropped segments
            crop_start_list = [crop_end - DURATION for crop_end in crop_end_list] # a list of starting times cropped segments
            # avoid the cropping warning
            crop_start_list = [crop_start if crop_start > pre_background_start else pre_background_start for crop_start in crop_start_list] # if start time is replaced with the start of the BG segments if it is smaller
            # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            whitened_fft_PBG_ind_list = [pre_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list
            whitened_fft_PBG_backwarding = np.concatenate(whitened_fft_PBG_ind_list, axis=None)
            del whitened_fft_PBG_ind_list  # delete redundant a list
            whitened_fft_PBG = np.append(whitened_fft_PBG_forwarding, whitened_fft_PBG_backwarding)
            del whitened_fft_PBG_forwarding, whitened_fft_PBG_backwarding  # delete redundant a list
        else: # if the preceding BG duration is the same as that of the target segment
            fft_PBG = pre_background.asd(DURATION, method='welch').value
            # whitening the preceding BG using the average BG
            whitened_fft_PBG = fft_PBG / ave_background_asd
        # for the following BG
        if fol_background.duration.value > DURATION:  # if the following BG duration is longer than that of the target segment
            # number of BG trials
            iterate = int(fol_background.duration.value / DURATION)
            # redefine the start and end times of the following BG to avoid cropping warning
            fol_background_start, fol_background_end = fol_background.span[0], fol_background.span[1]
            ## ------ cropping from the starting time ------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_start_list = [fol_background_start + i * DURATION for i in list_index_trials]  # a list of starting times cropped segments
            crop_end_list = [crop_start + DURATION for crop_start in crop_start_list]  # a list of ending times of cropped segments
            # avoid the cropping warning
            crop_end_list = [crop_end if crop_end < fol_background_end else fol_background_end for crop_end in crop_end_list]  # if end time is replaced with the end of the BG segments if it is larger
            # note: fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            # calculate each whitened FFT
            whitened_fft_FBG_ind_list = [fol_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list  # delete redundant lists
            whitened_fft_FBG_forwarding = np.concatenate(whitened_fft_FBG_ind_list, axis=None)
            del whitened_fft_FBG_ind_list  # delete redundant a list
            ## ------ cropping from the ending time -----
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_end_list = [fol_background_end - i * DURATION for i in list_index_trials]  # a list of ending times of cropped segments
            crop_start_list = [crop_end - DURATION for crop_end in crop_end_list]  # a list of starting times cropped segments
            # avoid the cropping warning
            crop_start_list = [crop_start if crop_start > fol_background_start else pre_background_start for crop_start in crop_start_list]  # if start time is replaced with the start of the BG segments if it is smaller
            # note: fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            whitened_fft_FBG_ind_list = [fol_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list
            whitened_fft_FBG_backwarding = np.concatenate(whitened_fft_FBG_ind_list, axis=None)
            del whitened_fft_FBG_ind_list # delete redundant a list
            whitened_fft_FBG = np.append(whitened_fft_FBG_forwarding, whitened_fft_FBG_backwarding)
            del whitened_fft_FBG_forwarding, whitened_fft_FBG_backwarding   # delete redundant a list
        else: # if the following BG duration is the same as that of the target segment
            fft_FBG = fol_background.asd(DURATION, method='welch').value
            # whitening the following BG using the average BG
            whitened_fft_FBG = fft_FBG / ave_background_asd
    elif pre_background_start != 0 and fol_background_start == 0:  # if there is an available preceding background
        # retrieve the preceding and following backgrounds
        if pre_background_start < full_timeseries_start:
            pre_background_start = full_timeseries_start
        pre_background = full_timeseries.crop(start=pre_background_start, end=pre_background_end)
        # fft of a preceding BG
        try:
            pre_background_asd = pre_background.asd(DURATION)  # ASD of the preceding BG
        except ValueError:
            raise ValueError
        ave_background_asd = pre_background_asd  # average of those two BG ASD
        ave_background_asd = ave_background_asd.value # convert it to numpy array
        # for the preceding BG
        if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
            # number of BG trials
            iterate = int(pre_background.duration.value / DURATION)
            # redefine the start and end times of the preceding BG to avoid cropping warning
            pre_background_start, pre_background_end = pre_background.span[0], pre_background.span[1]
            ## ----- cropping from the starting time -----
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_start_list = [pre_background_start + i * DURATION for i in list_index_trials] # a list of starting times cropped segments
            crop_end_list = [crop_start + DURATION for crop_start in crop_start_list] # a list of ending times of cropped segments
            # avoid the cropping warning
            crop_end_list = [crop_end if crop_end < pre_background_end else pre_background_end for crop_end in crop_end_list] # if end time is replaced with the end of the BG segments if it is larger
            # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            # calculate each whitened FFT
            whitened_fft_PBG_ind_list = [pre_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list # delete redundant lists
            whitened_fft_PBG_forwarding = np.concatenate(whitened_fft_PBG_ind_list, axis=None)
            del whitened_fft_PBG_ind_list # delete redundant a list
            ## ---- cropping from the ending time --------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_end_list = [pre_background_end - i * DURATION for i in list_index_trials] # a list of ending times of cropped segments
            crop_start_list = [crop_end - DURATION for crop_end in crop_end_list] # a list of starting times cropped segments
            # avoid the cropping warning
            crop_start_list = [crop_start if crop_start > pre_background_start else pre_background_start for crop_start in crop_start_list] # if start time is replaced with the start of the BG segments if it is smaller
            # note: fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            whitened_fft_PBG_ind_list = [pre_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list
            whitened_fft_PBG_backwarding = np.concatenate(whitened_fft_PBG_ind_list, axis=None)
            del whitened_fft_PBG_ind_list  # delete redundant a list
            whitened_fft_PBG = np.append(whitened_fft_PBG_forwarding, whitened_fft_PBG_backwarding)
            del whitened_fft_PBG_forwarding, whitened_fft_PBG_backwarding
        else: # if the preceding BG duration is the same as that of the target segment
            fft_PBG = pre_background.asd(DURATION, method='welch').value
            # whitening the preceding BG using the average BG
            whitened_fft_PBG = fft_PBG / ave_background_asd
        whitened_fft_FBG = None # it is not calculated
    elif pre_background_start == 0 and fol_background_start != 0:  # if there is an available following background
        # retrieve the preceding and following backgrounds
        if fol_background_end > full_timeseries_end:
            fol_background_end = full_timeseries_end
        fol_background = full_timeseries.crop(start=fol_background_start, end=fol_background_end)
        try:
            fol_background_asd = fol_background.asd(DURATION)  # ASD of the following BG
        except ValueError:
            raise ValueError
        ave_background_asd = fol_background_asd  # average of those two BG ASD
        ave_background_asd = ave_background_asd.value # convert it to numpy array
        whitened_fft_PBG = None # it is not calculated
        if fol_background.duration.value > DURATION:  # if the following BG duration is longer than that of the target segment
            # number of BG trials
            iterate = int(fol_background.duration.value / DURATION)
            # redefine the start and end times of the following BG to avoid cropping warning
            fol_background_start, fol_background_end = fol_background.span[0], fol_background.span[1]
            ## ----- cropping from the starting time ------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_start_list = [fol_background_start + i * DURATION for i in list_index_trials]  # a list of starting times cropped segments
            crop_end_list = [crop_start + DURATION for crop_start in crop_start_list]  # a list of ending times of cropped segments
            # avoid the cropping warning
            crop_end_list = [crop_end if crop_end < fol_background_end else fol_background_end for crop_end in crop_end_list]  # if end time is replaced with the end of the BG segments if it is larger
            # note: fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            # calculate each whitened FFT
            whitened_fft_FBG_ind_list = [fol_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list  # delete redundant lists
            whitened_fft_FBG_forwarding = np.concatenate(whitened_fft_FBG_ind_list, axis=None)
            del whitened_fft_FBG_ind_list  # delete redundant a list
            ## ------ cropping from the ending time --------
            # take some fraction of trials
            list_index_trials = select_some_trials(iterate)
            crop_end_list = [fol_background_end - i * DURATION for i in list_index_trials]  # a list of ending times of cropped segments
            crop_start_list = [crop_end - DURATION for crop_end in crop_end_list]  # a list of starting times cropped segments
            # avoid the cropping warning
            crop_start_list = [crop_start if crop_start > fol_background_start else pre_background_start for crop_start in crop_start_list]  # if start time is replaced with the start of the BG segments if it is smaller
            # note: fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
            whitened_fft_FBG_ind_list = [fol_background.crop(start=crop_start, end=crop_end).asd(DURATION, method='welch').value / ave_background_asd for crop_start, crop_end in zip(crop_start_list, crop_end_list)]
            del crop_start_list, crop_end_list
            whitened_fft_FBG_backwarding = np.concatenate(whitened_fft_FBG_ind_list, axis=None)
            del whitened_fft_FBG_ind_list  # delete redundant a list
            whitened_fft_FBG = np.append(whitened_fft_FBG_forwarding, whitened_fft_FBG_backwarding)
            del whitened_fft_FBG_forwarding, whitened_fft_FBG_backwarding
        else: # if the following BG duration is the same as that of the target segment
            fft_FBG = fol_background.asd(DURATION, method='welch').value
            # whitening the following BG using the average BG
            whitened_fft_FBG = fft_FBG / ave_background_asd
    # whitening Target
    target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    fft_target = target_timeseries.asd(DURATION, method='welch').value # fft of the target
    # whitening the target using the average BG
    whitened_fft_target = fft_target / ave_background_asd
    return whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION




def _even_millisecond_duration(duration):
    '''
    description:
        Truncate a duration below its third decimal place and, if the third decimal digit is odd,
        shorten it by 0.001 so that the digit becomes even.
        The whitening step raises ValueError when the lengths of the whitened data, the whitening data
        and the filter differ; this adjustment is the workaround used for that.

    :param duration: a duration in seconds
    :return: the adjusted duration rounded at the third decimal place
    '''
    # fractional part below the third decimal place, scaled to the fourth decimal digit
    fourth_decimal_num_of_duration = (duration * 1000 - floor(duration * 1000)) * 10
    # drop everything below the third decimal place
    duration = duration - fourth_decimal_num_of_duration * 0.0001
    third_decimal_num_of_duration = round((round(duration, 3) - round(duration, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        duration = duration - 0.001
    # the subtractions above leave small residues below the third decimal point
    return round(duration, 3)


def Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments and puts out the absolute values of the whitened timeseries
        of the on- and off-source windows instead of putting out the frequency series.
        The deviation of the whitened timeseries is informative, which is why absolute values are returned.
        This function was used to compare a metric evaluated in the time domain with the metric evaluated in the
        frequency domain. The frequency-domain metric turned out to be better, so this function is kept for reference only.

        The whole of full_timeseries is whitened with a duration-weighted average of two ASDs: one computed on
        everything up to pre_background_end and one computed on everything from fol_background_start onward.
        A start time equal to 0 marks an off-source window as unavailable; its output is then None.
    USAGE: whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION = Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)

    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG (0 if there is no preceding BG)
    :param pre_background_end: an end time of a preceding BG
    :param fol_background_start: a start time of a following BG (0 if there is no following BG)
    :param fol_background_end: an end time of a following BG
    :return:    whitened_target_timeseries_abs: the absolute values of whitened timeseries in the on-source window
                whitened_pre_off_source_abs: the absolute values of whitened timeseries in the preceding off-source window, or None if it is not available
                whitened_fol_off_source_abs: the absolute values of whitened timeseries in the following off-source window, or None if it is not available
                sample_rate: sampling rate of this channel
                DURATION: a duration of a target segment after the even-millisecond adjustment
    :raises ValueError: propagated unchanged from the ASD / whitening calls
    :raises ZeroDivisionError: if both background windows have zero length
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = _even_millisecond_duration(target_timeseries_end - target_timeseries_start)
    # lengths of the off-source windows, used as weights of their ASDs
    pre_background_duration = pre_background_end - pre_background_start
    fol_background_duration = fol_background_end - fol_background_start
    total_background_duration = pre_background_duration + fol_background_duration
    pre_background_asd_weight = pre_background_duration / total_background_duration
    fol_background_asd_weight = fol_background_duration / total_background_duration
    # ASDs of the data before / after the target; a ValueError raised here reaches the caller with its message intact
    pre_timeseries = full_timeseries.crop(end=pre_background_end)
    fol_timeseries = full_timeseries.crop(start=fol_background_start)
    pre_background_asd = pre_timeseries.asd(DURATION, method='welch')  # ASD of the preceding BG
    fol_background_asd = fol_timeseries.asd(DURATION, method='welch')  # ASD of the following BG
    # weighted average of those two BG ASDs
    ave_background_asd = pre_background_asd_weight * pre_background_asd + fol_background_asd_weight * fol_background_asd
    whitened_full_timeseries = full_timeseries.whiten(fftlength=DURATION, asd=ave_background_asd)
    # an unavailable off-source window is reported as None
    whitened_pre_off_source_abs = None
    whitened_fol_off_source_abs = None
    if pre_background_start != 0:  # if there is an available preceding background
        # the first 5% of the window is skipped (presumably to avoid edge effects of the whitening -- TODO confirm)
        whitened_pre_off_source = whitened_full_timeseries.crop(start=pre_background_start + 0.05 * pre_background_duration, end=pre_background_end)
        whitened_pre_off_source_abs = whitened_pre_off_source.abs().value
    if fol_background_start != 0:  # if there is an available following background
        # the last 5% of the window is skipped (presumably to avoid edge effects of the whitening -- TODO confirm)
        whitened_fol_off_source = whitened_full_timeseries.crop(start=fol_background_start, end=fol_background_end - 0.05 * fol_background_duration)
        whitened_fol_off_source_abs = whitened_fol_off_source.abs().value
    # whitened target, absolute value
    whitened_target_timeseries = whitened_full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    whitened_target_timeseries_abs = whitened_target_timeseries.abs().value
    return whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION


#
# def Multiprocess_subtracting(target_timeseries, pre_background, fol_background):
#     '''
#     description:
#         This is used for multi processing for subtracting segments
#     :param ch_label: channel label
#     :param target_timeseries: time series of a target segment
#     :param pre_background: time series of a preceding BG
#     :param fol_background: time series of a following BG
#     :param pre_background_start: start time of a preceding BG
#     :param pre_background_end:  end time of a preceding BG
#     :param fol_background_start: start time of a following BG
#     :return:    whitened_fft_target: whitened fft of a target segment
#                 whitened_fft_PBG: whitened fft of a preceding segment
#                 whitened_fft_FBG: whitened fft of a following segment
#                 sample_rate: sampling rate of this channel
#                 DURATION: a duration of a target segment
#     '''
#     sample_rate = target_timeseries.sample_rate.value
#     DURATION = target_timeseries.duration.value
#     fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
#     DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
#     # revise the duration to make an error in whitening process not not to occur
#     # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
#     third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
#     if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
#         DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
#     elif third_decimal_num_of_duration % 2 == 0:  # even number at the third decimal point
#         pass  # keep the original duration as it is
#     DURATION = round(DURATION, 3)  # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
#     pre_background_start = pre_background.span[0] # start time of a preceding BG
#     pre_background_end = pre_background.span[1] # end time of a preceding BG
#     fol_background_start = fol_background.span[0] # start time of a following BG
#     fol_background_end = fol_background.span[1] # end time of a following BG
#     if pre_background_start != 0 and fol_background_start != 0:  # if both BGs are available
#         # average background ASD
#         try:
#             pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs()# averaged fft of the preceding BG
#             fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs() # averaged fft of the following BG
#         except ValueError:
#             raise ValueError
#         ave_background_fft = (pre_background_fft + fol_background_fft) / 2.  # average of those two BG ASD
#         # for the preceding BG
#         if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
#             res_fft_PBG = np.array([]) # residual
#             # number of BG trials
#             iterate = int(pre_background.duration.value / DURATION)
#             for i in range(iterate): # iterate though the number of trials
#                 # iterate from the starting time of the preceding BG
#                 crop_start = pre_background_start + i * DURATION # start time of the cropping
#                 crop_end = crop_start + DURATION # ending time of of the cropping
#                 fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
#                 res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
#                 res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
#                 # iterate from the ending time of the preceding BG
#                 crop_end = pre_background_end - i * DURATION # ending time of of the cropping
#                 crop_start = crop_end - DURATION # start time of the cropping
#                 fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
#                 res_fft_PBG_ind = fft_PBG - ave_background_fft # calcurate a residual of the cropped BG from the averaged BG
#                 res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind)  # append to the whole trials array
#         else: # if the preceding BG duration is the same as that of the target segment
#             fft_PBG = pre_background.crop(start=pre_background_end - DURATION).average_fft(DURATION, window='hanning').abs()
#             # whitening the preceding BG using the average BG
#             res_fft_PBG = fft_PBG - ave_background_fft
#         # for the following BG
#         if fol_background.duration.value > DURATION:  # if the following BG duration is longer than that of the target segment
#             res_fft_FBG = np.array([])
#             # number of BG trials
#             iterate = int(fol_background.duration.value / DURATION)
#             for i in range(iterate): # iterate though the number of trials
#                 # iterate from the starting time of the following BG
#                 crop_start = fol_background_start + i * DURATION # start time of the cropping
#                 crop_end = crop_start + DURATION # ending time of of the cropping
#                 fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
#                 res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of  the cropped BG using the averaged BG
#                 res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
#                 # iterate from the ending time of the following BG
#                 crop_end = fol_background_end - i * DURATION  # ending time of of the cropping
#                 crop_start = crop_end - DURATION # start time of the cropping
#                 fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
#                 res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG
#                 res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
#         else: # if the following BG duration is the same as that of the target segment
#             fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs()
#             # whitening the following BG using the average BG
#             res_fft_FBG = fft_FBG - ave_background_fft
#     elif pre_background_start != 0 and fol_background_start == 0:  # if there is an available proceeding background
#         # fft of a preceding BG
#         try:
#             pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs()  # fft of the preceding BG
#         except ValueError:
#             raise ValueError
#         ave_background_fft = pre_background_fft  # average of those two BG ASD
#         # for the preceding BG
#         if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
#             res_fft_PBG = np.array([])
#             # number of BG trials
#             iterate = int(pre_background.duration.value / DURATION)
#             for i in range(iterate): # iterate though the number of trials
#                 # iterate from the starting time of the preceding BG
#                 crop_start = pre_background_start + i * DURATION # start time of the cropping
#                 crop_end = crop_start + DURATION # ending time of of the cropping
#                 fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
#                 res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
#                 res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
#                 # iterate from the ending time of the preceding BG
#                 crop_end = pre_background_end - i * DURATION # ending time of of the cropping
#                 crop_start = crop_end - DURATION # start time of the cropping
#                 fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
#                 res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of  the cropped BG using the averaged BG
#                 res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind)  # append to the whole trials array
#         else: # if the preceding BG duration is the same as that of the target segment
#             fft_PBG = pre_background.crop(start=pre_background_end - DURATION).average_fft(DURATION, window='hanning').abs()
#             # whitening the preceding BG using the average BG
#             res_fft_PBG = fft_PBG - ave_background_fft
#         whitened_fft_FBG = None # it is not calculated
#     elif pre_background_start == 0 and fol_background_start != 0:  # if there is an available following background
#         try:
#             fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs()  # fft of the following BG
#         except ValueError:
#             raise ValueError
#         ave_background_fft = fol_background_fft  # average of those two BG ASD
#         whitened_fft_PBG = None # it is not calculated
#         if fol_background.duration.value > DURATION:  # if the following BG duration is longer than that of the target segment
#             res_fft_FBG = np.array([])
#             # number of BG trials
#             iterate = int(fol_background.duration.value / DURATION)
#             for i in range(iterate): # iterate though the number of trials
#                 # iterate from the starting time of the following BG
#                 crop_start = fol_background_start + i * DURATION # start time of the cropping
#                 crop_end = crop_start + DURATION # ending time of of the cropping
#                 fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
#                 res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
#                 res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
#                 # iterate from the ending time of the following BG
#                 crop_end = fol_background_end - i * DURATION  # ending time of of the cropping
#                 crop_start = crop_end - DURATION # start time of the cropping
#                 fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
#                 res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
#                 whitened_fft_FBG = np.append(whitened_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
#         else: # if the following BG duration is the same as that of the target segment
#             fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs()
#             # whitening the following BG using the average BG
#             res_fft_FBG = fft_FBG - ave_background_fft
#     # whitening Target
#     fft_target = target_timeseries.average_fft(DURATION, window='hanning').abs() # fft of the target
#     # whitening the target using the average BG
#     res_fft_target = fft_target - ave_background_fft
#     return res_fft_target, res_fft_PBG, res_fft_FBG, sample_rate, DURATION



def SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1. for each segment, query the frame data spanning the target and whichever
           of the preceding / following background segments is available
        2. pass every channel to Multiprocess_whitening() in a process pool
        3. save the returned target and background FFTs in a new HDF5 file
        Note this depends on Multiprocess_whitening()

        Layout of the output file: one group 'gpsNNNNN' per segment (NNNNN is
        IndexSatisfied) with attributes 'GPS', 'duration', 'snr', 'confidence',
        'id' and 'ifo'. Inside a group, channel k is stored as dataset 'chTkkkkk'
        (target), 'chPkkkkk' (preceding BG, if any) and 'chFkkkkk' (following BG,
        if any), each with attributes 'SamplingRate', 'channel' and 'duration'.
    USAGE: SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process)

    :param Listsegments: an iterable of 12-element records
        (IndexSatisfied, target_timeseries_start, target_timeseries_end,
        pre_background_start, pre_background_end, fol_background_start,
        fol_background_end, gpstime, duration, SNR, confi, ID);
        a background whose start and end are both 0 is treated as unavailable
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: the interferometer prefix, used to build '<IFO>:<channel>' and the frame types
    :param outputpath: a directory of an output file, it has to exist
    :param outputfilename: a name of an output file, it must not exist yet
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return: None, the program exits through sys.exit() if the directory is missing or the file can not be created
    '''
    # check if the output directory exists
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, 'w-' fails if the file already exists so nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            group_name = 'gps{:05d}'.format(IndexSatisfied)
            # a background is available if both edges are non-zero and missing if both are zero
            pre_available = pre_background_start != 0 and pre_background_end != 0
            pre_missing = pre_background_start == 0 and pre_background_end == 0
            fol_available = fol_background_start != 0 and fol_background_end != 0
            fol_missing = fol_background_start == 0 and fol_background_end == 0
            # the time span to be queried, it always contains the target segment
            if pre_available and fol_available:
                query_start, query_end = pre_background_start, fol_background_end
            elif pre_available and fol_missing:
                query_start, query_end = pre_background_start, target_timeseries_end
            elif pre_missing and fol_available:
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # no consistent background: there is nothing to query, and going on would
                # reuse the data of the previous segment (or fail on the first one)
                print('No usable background for {}, go to a next GPS'.format(group_name))
                continue
            try: # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True': # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                    del full_timeseriesDict_hoft # delete the hoft object
                    channels_used = np.append(channels, hoft_channel)
            except IndexError:
                print('Bad data, go to a next GPS')
                continue # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError: # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError: # e.g. "Failed to read <frame file>: Unable to map from the stream class id (0) to internal class id"
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group(group_name)
            g.attrs['GPS'] = gpstime # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR # snr
            g.attrs['confidence'] = confi # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                # process target and BG segments of all channels with multi processes,
                # each result is (target, preceding BG, following BG, sample rate, duration)
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    whitened = list(executor.map(Multiprocess_whitening,
                                                 full_timeseriesDict.values(),
                                                 itrtls.repeat(target_timeseries_start),
                                                 itrtls.repeat(target_timeseries_end),
                                                 itrtls.repeat(pre_background_start),
                                                 itrtls.repeat(pre_background_end),
                                                 itrtls.repeat(fol_background_start),
                                                 itrtls.repeat(fol_background_end)))
                for ch_label, channel in enumerate(channels_used): # iterate through channel labels
                    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = whitened[ch_label]
                    # the target is always saved; a BG is saved only if it is available,
                    # this condition is required for the "Either" BG selection.
                    # "is not None" is needed here: "!= None" on an array is element-wise
                    # and its truth value raises ValueError
                    datasets = [('chT', whitened_fft_target)]
                    if whitened_fft_PBG is not None:
                        datasets.append(('chP', whitened_fft_PBG))
                    if whitened_fft_FBG is not None:
                        datasets.append(('chF', whitened_fft_FBG))
                    for prefix, data in datasets:
                        dset = g.create_dataset('{}{:05d}'.format(prefix, ch_label), data=data, compression="gzip", compression_opts=9)
                        dset.attrs['SamplingRate'] = sample_rate
                        dset.attrs['channel'] = channel.split(':')[1]
                        dset.attrs['duration'] = DURATION
            except ValueError:
                del f[group_name]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            except IOError: # IOError: Driver write request failed
                del f[group_name]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))




def SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT):
    '''
    description:
        THIS IS USED FOR "ONLINE" MODE
        This is a generator: the HDF5 group of a segment is yielded right after it
        has been written, while the output file is still open.
        0. assuming Listsegments is given by Findglitchlist()
        1. for each segment, query the frame data spanning the target and whichever
           of the preceding / following background segments is available
        2. pass every channel to Multiprocess_whitening() in a process pool
        3. save the returned target and background FFTs in a new HDF5 file
        4. yield the group of the segment
        Note this depends on Multiprocess_whitening()

        Layout of the output file: one group 'gpsNNNNN' per segment (NNNNN is
        IndexSatisfied) with attributes 'GPS', 'duration', 'snr', 'confidence',
        'id' and 'ifo'. Inside a group, channel k is stored as dataset 'chTkkkkk'
        (target), 'chPkkkkk' (preceding BG, if any) and 'chFkkkkk' (following BG,
        if any), each with attributes 'SamplingRate', 'channel' and 'duration'.
    USAGE: for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT): ...

    :param Listsegments: an iterable of 12-element records
        (IndexSatisfied, target_timeseries_start, target_timeseries_end,
        pre_background_start, pre_background_end, fol_background_start,
        fol_background_end, gpstime, duration, SNR, confi, ID);
        a background whose start and end are both 0 is treated as unavailable
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: the interferometer prefix, used to build '<IFO>:<channel>' and the frame types
    :param outputpath: a directory of an output file, it has to exist
    :param outputfilename: a name of an output file, it must not exist yet
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :return: a generator of the HDF5 group objects, one per saved segment;
        the program exits through sys.exit() if the directory is missing or the file can not be created
    '''
    # check if the output directory exists, otherwise the file creation below
    # fails and reports a misleading "file already exists"
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, 'w-' fails if the file already exists so nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            group_name = 'gps{:05d}'.format(IndexSatisfied)
            # a background is available if both edges are non-zero and missing if both are zero
            pre_available = pre_background_start != 0 and pre_background_end != 0
            pre_missing = pre_background_start == 0 and pre_background_end == 0
            fol_available = fol_background_start != 0 and fol_background_end != 0
            fol_missing = fol_background_start == 0 and fol_background_end == 0
            # the time span to be queried, it always contains the target segment
            if pre_available and fol_available:
                query_start, query_end = pre_background_start, fol_background_end
            elif pre_available and fol_missing:
                query_start, query_end = pre_background_start, target_timeseries_end
            elif pre_missing and fol_available:
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # no consistent background: there is nothing to query, and going on would
                # reuse the data of the previous segment (or fail on the first one)
                print('No usable background for {}, go to a next GPS'.format(group_name))
                continue
            # extracting the target data and the proceeding and following backgrounds
            try: # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True': # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                    del full_timeseriesDict_hoft # delete the hoft object
                    channels_used = np.append(channels, hoft_channel)
            except IndexError:
                # e.g. "After reading object of type: FrAdcData stream is no longer in good state."
                continue # go to a next GPS time
            except RuntimeError:
                continue # go to a next GPS time
            except IOError: # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError: # e.g. "Failed to read <frame file>: Unable to map from the stream class id (0) to internal class id"
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group(group_name)
            g.attrs['GPS'] = gpstime # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR # snr
            g.attrs['confidence'] = confi # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                # process target and BG segments of all channels with multi processes.
                # Each result is unpacked as (target, preceding BG, following BG, sample rate,
                # duration), the same order the OFFLINE and TimeShift functions use, so that
                # 'chP' really holds the preceding BG and 'chF' the following one.
                # NOTE(review): confirm the order against the return of Multiprocess_whitening()
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    whitened = list(executor.map(Multiprocess_whitening,
                                                 full_timeseriesDict.values(),
                                                 itrtls.repeat(target_timeseries_start),
                                                 itrtls.repeat(target_timeseries_end),
                                                 itrtls.repeat(pre_background_start),
                                                 itrtls.repeat(pre_background_end),
                                                 itrtls.repeat(fol_background_start),
                                                 itrtls.repeat(fol_background_end)))
                for ch_label, channel in enumerate(channels_used): # iterate through channel labels
                    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = whitened[ch_label]
                    # the target is always saved; a BG is saved only if it is available,
                    # this condition is required for the "Either" BG selection.
                    # "is not None" is needed here: "!= None" on an array is element-wise
                    # and its truth value raises ValueError
                    datasets = [('chT', whitened_fft_target)]
                    if whitened_fft_PBG is not None:
                        datasets.append(('chP', whitened_fft_PBG))
                    if whitened_fft_FBG is not None:
                        datasets.append(('chF', whitened_fft_FBG))
                    for prefix, data in datasets:
                        dset = g.create_dataset('{}{:05d}'.format(prefix, ch_label), data=data, compression="gzip", compression_opts=9)
                        dset.attrs['SamplingRate'] = sample_rate
                        dset.attrs['channel'] = channel.split(':')[1]
                        dset.attrs['duration'] = DURATION
            except ValueError:
                del f[group_name]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue
            except IOError: # IOError: Driver write request failed
                del f[group_name]
                print('*** delete group gps{:05d} due to memory error  ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            yield g # pass a group object as a generator
            # reached only when the consumer asks for the next group
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))






[docs]def SaveTargetAndBackGroundHDF5_TimeShift(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1.take the information of the list of allowed target and a preceding and following segments
        2. whiten a target segmement based on the average background segment
        3. find whitened FFT
        4. save the whitened target and backgrounds FFTs
        Note this depends on
    USAGE: SaveTargetAndBackGroundHDF5(Listsegments, re_sfchs, IFO, outputpath, outputfilename, mode=='offline')

    :param Listsegments: a list of segment parameters
    :param channels: a list of safe channels
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    '''
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # open a HDF5 file in writing mode, create a file if not exist yet
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            try: # deal with time series in bad state
                if IndexSatisfied == 0:
                    if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    else:
                        pass
                    if PlusHOFT == 'True': # get the data of hoft
                        if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        else:
                            pass
                        full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                        del full_timeseriesDict_hoft  # delete the hoft object
                        channels = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
                    previous_full_timeseriesDict = full_timeseriesDict
                    previous_pre_background_start = pre_background_start
                else:
                    if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
                        full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                    else:
                        pass
                    if PlusHOFT == 'True': # get the data of hoft
                        if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                            full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                        else:
                            pass
                        full_timeseriesDict.append(full_timeseriesDict_hoft) # append
                        del full_timeseriesDict_hoft  # delete the hoft object
                        channels = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
                    full_timeseriesDict.prepend(previous_full_timeseriesDict.crop(start=previous_pre_background_start))
                    previous_full_timeseriesDict = full_timeseriesDict
                    previous_pre_background_start = pre_background_start
                # =====================================================
            except IndexError:
                print('Bad data, go to a next GPS')
                continue # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError: # error when getting time series
                print('IOError, go to a next GPS')
            except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
                print('ValueError, go to a next GPS')
            t2 = time.time()
            print('Time taken for query: {} s'.format(t2 - t0))
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR # snr
            g.attrs['confidence'] = confi # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                ch_labels = range(len(channels)) # a list of channel labels
                List_whitened_fft_target = [] # a list of whitened fft of target
                List_whitened_fft_PBG = [] # a list of whitened fft of PBG
                List_whitened_fft_FBG = [] # a list of whitened fft of FBG
                List_sample_rate = [] # a list of sample rates
                List_DURATION = [] # a list of durations
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION in executor.map(Multiprocess_whitening,
                                                                                                    full_timeseriesDict.values(),
                                                                                                    itrtls.repeat(target_timeseries_start),
                                                                                                    itrtls.repeat(target_timeseries_end),
                                                                                                    itrtls.repeat(pre_background_start),
                                                                                                    itrtls.repeat(pre_background_end),
                                                                                                    itrtls.repeat(fol_background_start),
                                                                                                    itrtls.repeat(fol_background_end)
                                                                                                    ):
                        List_whitened_fft_target.append(whitened_fft_target) # fill
                        List_whitened_fft_PBG.append(whitened_fft_PBG) # fill
                        List_whitened_fft_FBG.append(whitened_fft_FBG) # fill
                        List_sample_rate.append(sample_rate) # fill
                        List_DURATION.append(DURATION) # fill
                for ch_label in ch_labels: # iterate through channel labels
                    dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=List_whitened_fft_target[ch_label], compression="gzip", compression_opts=9)  # target
                    dsetTarget.attrs['SamplingRate'] = List_sample_rate[ch_label]  # target
                    dsetTarget.attrs['channel'] = channels[ch_label].split(':')[1]  # target
                    dsetTarget.attrs['duration'] = List_DURATION[ch_label]  # target
                    if List_whitened_fft_PBG[ch_label] != None: # if the preceding BG is available, this condition is required for the "Either" BG selection
                        dsetBG = g.create_dataset("chP{:05d}".format(ch_label), data=List_whitened_fft_PBG[ch_label], compression="gzip", compression_opts=9)  # BG
                        dsetBG.attrs['SamplingRate'] = List_sample_rate[ch_label]  # BG
                        dsetBG.attrs['channel'] = channels[ch_label].split(':')[1]  # BG
                        dsetBG.attrs['duration'] = List_DURATION[ch_label]  # BG
                    else: # if the preceding BG is NOT available
                        pass
                    if List_whitened_fft_FBG[ch_label] != None: # if the following BG is available,  this condition is required for the "Either" BG selection
                        dsetBG = g.create_dataset("chF{:05d}".format(ch_label), data=List_whitened_fft_FBG[ch_label], compression="gzip", compression_opts=9)  # BG
                        dsetBG.attrs['SamplingRate'] = List_sample_rate[ch_label]  # BG
                        dsetBG.attrs['channel'] = channels[ch_label].split(':')[1]  # BG
                        dsetBG.attrs['duration'] = List_DURATION[ch_label]  # BG
                    else: # if the following BG is NOT available
                        pass
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            except IOError: # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error  ***'.format(IndexSatisfied))
                continue # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))



#+++++++++++++++++++++++++++++++++++++++++++++++
# plot tools
#+++++++++++++++++++++++++++++++++++++++++++++++


[docs]class PlotTableAnalysis():
    def __init__(self):
        '''
        create an analysis object with no HDF5 file attached yet

        :return: None
        '''
        self.f = None # a HDF5 file storing a glitch class

[docs]    def getHDF5Object(self):
        '''
        show a HDF5 file object
        USAGE: getHDF5Object()

        :return: a dictionary
        '''
        if self.f == None:
            print('A HDF5 file object is not set yet')
        else:
            return self.f

[docs]    def setHDF5Object(self, input_dir, input_file):
        '''
        set a HDF5 file object

        :param f_dict: a dictionary of time series data sets
        :return: None
        '''
        input_path = os.path.join(input_dir, input_file)
        f_input = h5py.File(input_path, 'r') # read it in the read mode
        self.f = f_input

[docs]    def FindNumberSample(self):
        '''
        description:
            find the number of samples

        :return: the number of samples
        '''
        number_sample = len(list(self.f.keys()))
        return number_sample

[docs]    def FindNumberChannels(self, g):
        '''
        description:
            count a number of channels that are analyzed
        USAGE: NumberOfChannels = FindNumberChannels(g)

        :param g: a HDF file group object
        :return: NumberOfChannels: a number of channels that are analyzed
        '''
        TPF_list = np.array([str(dset)[2] for dset in list(g.keys())])
        NumberOfChannels = len(np.where(TPF_list == 'T')[0])
        return NumberOfChannels



[docs]    def HierarchyChannelAboveThreshold(self, g, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'):
        '''
        description:
            calculate values of importance which is the number of frequency bins above a the off-source window for channels for a given time of a glitch
        USAGE: RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration  = HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq, sigma=10)

        :param g: (hdf5 format) a group having a glitch
        :param pt: a value of area integrating a distribution upto
        :param sigma: value of standard deviation of ratio of medians to determine important channels
        :return:
            RankingChannelAndRatioMed: # a list of channel names with their ratios descended by ratios
            Importantchannels: # channels with their ratio is greater than the threshold of ratio, along with their ratios
        '''
        SNR = g.attrs['snr']
        GPS = g.attrs['GPS']
        confidence = g.attrs['confidence']
        ID = g.attrs['id']
        DURATION = g.attrs['duration']
        ListChannelName = np.array([])
        ListCount = np.array([])
        NumberOfChannels = self.FindNumberChannels(g)
        for i in range(NumberOfChannels):
            dsetT = g['chT{:05d}'.format(i)] # a target data set
            try: # this is required in the condition for "Either" BG selection
                dsetP = g['chP{:05d}'.format(i)] # a preceding BG data set
            except KeyError:
                dsetP = None
            try: # this is required in the condition for "Either" BG selection
                dsetF = g['chF{:05d}'.format(i)]  # a following BG data set
            except KeyError:
                dsetF = None
            ChannelName = dsetT.attrs['channel'].decode('utf-8') # channel name
            duration = dsetT.attrs['duration'] # duration
            sampling_rate = dsetT.attrs['SamplingRate']  # sampling rate of this channel
            spT = dsetT[()] # target data set pixel value
            if dsetP != None: # this is required in the condition for "Either" BG selection
                spP = dsetP[()] # a preceding BG data set pixel value
            else:
                spP = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
            if dsetF != None: # this is required in the condition for "Either" BG selection
                spF = dsetF[()]  # a following BG data set pixel value
            else:
                spF = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
            #=======================
            if dsetP != None:
                Number_PBG_trials = spP.shape[0] / spT.shape[0]
                spP = spP.reshape(int(Number_PBG_trials), spT.shape[0])
            if dsetF != None:
                Number_FBG_trials = spF.shape[0] / spT.shape[0]
                spF = spF.reshape(int(Number_FBG_trials), spT.shape[0])
            #=======================
            #++++++ for the case where there is no data measured at a channel +++++++++++++++++++
            #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++
            #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
            if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:  # if target and BG have data measured
                pT = spT #** 2 # power of target
                if dsetP != None:
                    pP = spP #** 2 # power of a preceding BG
                if dsetF != None:
                    pF = spF #** 2  # power of a following BG
                # low, high, or band pass filter if we know which frequency band is important
                if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
                    ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
                    if sampling_rate / 2 <= LowerCutOffFreq:
                        pT = np.array([0])
                        pP = np.array([0])
                        pF = np.array([0])
                    else:
                        LowerCutOffElement = int(LowerCutOffFreq*duration)
                        if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
                            pT = pT[LowerCutOffElement:]
                            if dsetP != None:
                                pP = pP[:, LowerCutOffElement:]
                            if dsetF != None:
                                pF = pF[:, LowerCutOffElement:]
                        else: # if LowerCutOffElement is greater than the length of the vector
                            pT = np.array([0])
                            if dsetP != None:
                                pP = np.array([0])
                            if dsetF != None:
                                pF = np.array([0])
                elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
                    # even if UpperCutOffElement is greater than the length of the vector is works fine
                    UpperCutOffElement = int(UpperCutOffFreq * duration)
                    pT = pT[:UpperCutOffElement]
                    if dsetP != None:
                        pP = pP[:, :UpperCutOffElement]
                    if dsetF != None:
                        pF = pF[:, :UpperCutOffElement]
                elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':  # band pass filter
                    ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
                    if sampling_rate / 2 <= LowerCutOffFreq:
                        pT = np.array([0])
                        pP = np.array([0])
                        pF = np.array([0])
                    else:
                        LowerCutOffElement = int(LowerCutOffFreq * duration)
                        UpperCutOffElement = int(UpperCutOffFreq * duration)
                        if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
                            pT = pT[LowerCutOffElement:UpperCutOffElement]
                            if dsetP != None:
                                pP = pP[:, LowerCutOffElement:UpperCutOffElement]
                            if dsetF != None:
                                pF = pF[:, LowerCutOffElement:UpperCutOffElement]
                        else: # if LowCutOffElement is greater than the length of the vector
                            pT = np.array([0])
                            if dsetP != None:
                                pP = np.array([0])
                            if dsetF != None:
                                pF = np.array([0])
                else:
                    pass
                if dsetP != None and dsetF != None:
                    pB = np.concatenate((pP, pF), axis=0)
                elif dsetP != None and dsetF == None:
                    pB = pP
                elif dsetP == None and dsetF != None:
                    pB = pF
                GausCeiling = pB.mean() + sigma * pB.std(ddof=1) # gaussian noise ceiling
                ########a metric with amplitude ratio ########
                #try:
                #    GausCeiling = pB.sum(axis=1).mean() + pB.sum(axis=1).std(ddof=1)
                #except ValueError: # if pB is [0 0]
                #    GausCeiling = 0
                ##################################################
                #pt = 0.95
                #histB, edgB = np.histogram(pB, bins=100, density=True) # histgram of the background
                #deltaB = edgB[1] - edgB[0] # the length of the bin
                #cumnumB = np.cumsum(histB * deltaB) # cumulative number
                #UpperBoundB = edgB[np.where(cumnumB > pt)[0][0]:np.where(cumnumB > pt)[0][0] + 2].mean()  # median value
                # Note that even if pT, pB do not have elements in them, UpperBoundB becomes = 0.0050 so that there is no element in pT above the value as
                # I set pT = np.array([0]) in this case. Eventually, Count = 0, which is ok
                #Count = pT[pT > UpperBoundB].shape[0] / float(pT.shape[0])  # a fraction of components above the ceiling
                ############################
                try:
                    Count = pT[pT > GausCeiling].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling
                    ########### a metric with amplitude ratio ###############
                    #Count = pT.sum() / GausCeiling
                    #if Count < 0:
                    #    Count = 0
                except ZeroDivisionError: # in the situation where the number of the total elments is zero, I saw GausCeiling is nan. So this channel has mulfunction. I manually set Count = 0
                    Count = 0
                # if the data is flat over the stretch of the segment, the upper bound threshold of BG called "GausCeiling" becomes zero (BG has to flactuate so that "GausCeiling" equal to zero is fake)
                # if so that importance called "Count" becomes 1. To prevent this fake importance, I set Count = 0 manually
                if GausCeiling == 0.:
                    Count = 0
            else:
                Count = 0 # manually put zero because this channel has no contribution at this glitch ID
            ListCount = np.append(ListCount, Count)
            ListChannelName = np.append(ListChannelName, ChannelName)
        #+++++++++ find a hierarchy of channels +++++++++++++++++++++
        RankingChannelAndCount = [[ListChannelName[i], ListCount[i]] for i in np.argsort(ListCount)[::-1]] # a list of channel names with maximum value of SNR series
        return RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, DURATION



[docs]    def CreateMatCount(self, sigma, g_Individual = None, LowerCutOffFreq='None', UpperCutOffFreq='None'):
        '''
        description:
            1. find counts for each glitch
            2. stack over all the gliches
            3. make a matix comprising importance versus channels
        USAGE: MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration = CreateMatCount(sigma, LowerCutOffFreq='None', UpperCutOffFreq='None') # for all glitches
                MatCount, ListChannelName, ListSNR, ListConf = CreateMatCount(sigma, g_Individual, LowerCutOffFreq='None', UpperCutOffFreq='None') # for an individual glitch

        :param sigma sigma: an integer to determine the upper bound of the off-source window
        :param g_Individual: a grounp of a HDF5 file that has values of importance for a gltich
        :param LowerCutOffFreq: a lower limit frequency cut to calculate a value of importance
        :param UpperCutOffFreq: an uppper limit frequency cut to calculate a value of importance
        :return:
            MatCount: a matrix comprising importance versus channels
            ListChannelName: a list of channel names
            ListSNR: a list of SNRs
            ListConf: a list of confidence
        dependencies: self.HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq)
        '''
        if g_Individual == None: # for all the glitches in a glitch class
            MatCount = np.array([])
            ListSNR = np.array([])
            ListConf = np.array([])
            ListGPS = np.array([])
            ListDuration = np.array([])
            num_dataset = 0 # a number of datasets in a group, set to be 0 as a first instance
            for g_label in list(self.f.keys()): # iterate through all the groups
                try:
                    g = self.f[g_label]
                except KeyError: # KeyError: 'Unable to open object (Bad object header version number)'
                    continue
                # the following if statement is for the solution where no dataset in a group
                if len(g.keys()) < num_dataset: # if a number of dataset in this group is less than that of the previous group
                    continue # go to the next iteration
                num_dataset = len(g.keys()) # update a number of datasets with that of the current group
                # get an output given by HierarchyChannelAboveThreshold()
                RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g, sigma, LowerCutOffFreq, UpperCutOffFreq)
                # stack the vector of counts
                MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
                # append SNR
                ListSNR = np.append(ListSNR, SNR)
                # append confidence level
                ListConf = np.append(ListConf, confidence)
                # append GPS time
                ListGPS = np.append(ListGPS, GPS)
                # append duration
                ListDuration = np.append(ListDuration, duration)
            return MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration
        else: # individual glitch
            MatCount = np.array([])
            ListSNR = np.array([])
            ListConf = np.array([])
            # get an output given by HierarchyChannelAboveThreshold()
            RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g_Individual, sigma, LowerCutOffFreq, UpperCutOffFreq)
            # stack the vector of counts
            MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
            # append SNR
            ListSNR = np.append(ListSNR, SNR)
            # append confidence level
            ListConf = np.append(ListConf, confidence)
            # normalize MatCount by a number of glitches
            MatCount = MatCount
            return MatCount, ListChannelName, ListSNR, ListConf



[docs]    def CreateChannelTicks(self, ListChannelName):
        '''
        description:
            1. take the dominant sub-channel names
            2. get a list of index where a sub-sensor name changes
        dependencies: (tacitly) CreateMatCount(), make_subset_channel_based_on_samplingrate()
        USAGE: CenterTicks, ListInd, ListSubsys = CreateChannelTicks(ListChannelName)

        :param ListChannelName: a list of channel names
        :return:
            CenterTicks: center value of a dominant sub sensor belongs
            ListInd: the edge indices of a dominant sub sensor belongs
            ListSubsys: a list of dominant sensor names
        '''
        # +++ take the dominant sub-channel names +++++++++
        ListSubsys = np.array([])  # a list of dominant sub sensor names
        for i in ListChannelName:  # iterate over channel names
            Subsys = i.split('-')[0]  # take before '-'
            ListSubsys = np.append(ListSubsys, Subsys)  # append

        # ++++ get a list of index where a sub-sensor name changes +++++++++++
        i = 'dummy'
        ListInd = np.array([])  # a list of indices where dominant channel name changes
        for ind, j in enumerate(ListSubsys, 0):  # iterate dominant sub sensor names
            if i != j:  # if name differencee occurs
                ListInd = np.append(ListInd, ind)  # store an index
            i = j  # rename current dominant sub name
        # append the end index
        ListInd = np.append(ListInd, np.array([ListSubsys.shape[0] - 1]))
        # find the center index of a dominant sub-channel
        CenterTicks = np.array([(i + j) / 2. if j - i != 1 else i for i, j in zip(ListInd[:-1], ListInd[1:])])
        return CenterTicks, ListInd, ListSubsys



[docs]    def PlotImportanceVSChannel(self, MatCount, ListChannelName, output_dir, output_file, ax=None):
        '''
        description:
            - for all glitches in a class, this method works stand-alone
            1. convert number to channel names in x ticks 
            2. color background based on channel types 
            3. plot a bar showing importance of channels 
            USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file)
            - for an individual glitch in a class, this method is used by ....
            1. convert number to channel names in x ticks 
            2. color background based on channel types 
            3. plot a bar showing importance of channels
        dependencies: CreateChannelTicks() tacitly CreateMatCount()
        USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax) # for on-line mode
        USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file) @ for off-line mode

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param output_dir: (only used for all glitches)
        :param output_file: (only used for all glitches)
        :param ax: matplotlib.pyplot object (used only for an individual glitch)
        :return: None
        '''
        if ax == None: # plot for all glitches in a class
            CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
            # color the background either with black or gray for each channel
            Listcolor = ['black', 'gray'] * 20 * 7
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size
                    edgeRight = edgeRight + 1
                ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                ax.set_edgecolor(None)
            if MatCount.shape[0] == MatCount.shape[-1]: # for an individual glitch
                MatCount = MatCount.reshape(1, MatCount.shape[-1])
            #++++++++  plot bar plot about importance of channels ++++++++++++++++
            # sum all the counts over glitches
            SumMatCount = np.sum(MatCount, axis=0) / float(MatCount.shape[0])  # normalize MatCount by a number of glitches
            #if MatCount.shape[0] > 1: # if there are more than 1 sample
            #    STDMatCount = MatCount.std(axis=0, ddof=1) # standard deviation
            #else: # if there is only one sample
            #    STDMatCount = np.array([None]*SumMatCount.shape[0])  # standard deviation
            # plot bars about importance
            plt.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
            #+++++++++++ plot a table of ranked channels +++++++++++++++++++
            # sort channel based on the sum of the counts i.e, importance
            # a list of sorted Channel names & sorted Importance
            RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]]))
            #RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]], STDMatCount[np.argsort(SumMatCount)[::-1]]))
            # make pandas matrix
            Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance'))
            #Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance', 'Std'))
            # add ranking label
            Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1))
            # round importance
            Impdf['Importance'] = Impdf['Importance'].astype(float).round(3)  # = df.value1.round()
            #Impdf['Std'] = Impdf['Std'].astype(float).round(4)
            # re-sort columns to make rank labels coming first
            cols = Impdf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            Impdf = Impdf[cols]
            # take the first 5 high ranking channels
            dcsummary = Impdf.iloc[:5]
            # plot table on the top of the bar plot
            table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2])
            #table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top',colWidths=[0.1, 0.6, 0.1, 0.1])
            table.set_fontsize(8)
            #++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
            #++++++++++++++  alter ticks +++++++++++++++++
            # plot xticks using the dominant sub sensor names
            plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
            # put ticks above
            plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
            plt.tick_params(axis="y", labelsize=10)
            plt.grid(False)
            plt.xlim(0, ListSubsys.shape[0])
            # set axis label
            plt.ylabel('Importance', fontsize=15)
            plt.xlabel('Auxiliary channel', fontsize=15)
            plt.tight_layout(rect=[0, 0.03, 1, 0.8])
            # save a plot
            output_path = os.path.join(output_dir, output_file)
            plt.savefig(output_path)
            plt.close()
            print('Completed Saving a plot of Importance VS channel')
            return None

        else: # for an individual glitch
            # ++++++++++++++  alter ticks +++++++++++++++++
            # get xticks for dominant channels
            CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
            # plot xticks using the dominant sub sensor names
            ax.set_xticks(CenterTicks)
            ax.set_xticklabels(ListSubsys[ListInd.astype(int)])
            # put ticks above
            ax.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-35, length=0, labelrotation=90,size=0, labelsize=15)
            ax.tick_params(axis='y', labelsize=15)
            ax.set_xlim(0, ListSubsys.shape[0])
            # color the background either with blakc or gray for each channel
            # ++++++++  plot bar plot about importance of channels ++++++++++++++++
            Listcolor = ['black', 'gray'] * 20 * 7
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size
                    edgeRight = edgeRight + 1
                ax.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            if MatCount.shape[0] == MatCount.shape[-1]:  # for an individual glitch
                MatCount = MatCount.reshape(1, MatCount.shape[-1])
            # ++++++++  plot bar plot about importance of channels ++++++++++++++++
            # sum all the counts over glitches
            SumMatCount = np.sum(MatCount, axis=0)
            # +++++++++++ plot a table of high ranking channels +++++++++++++++++++
            # sort channel based on the sum of the counts i.e, importance
            # a list of sorted Channel names & sorted Importance
            RankChannel = list(
                zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]]))
            # make pandas matrix
            Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance'))
            # add ranking label
            Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1))
            # round importance
            Impdf['Importance'] = Impdf['Importance'].astype(float).round(3)  # = df.value1.round()
            # re-sort columns to make ranking labels coming first
            cols = Impdf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            Impdf = Impdf[cols]
            # take the first 5 high ranking channels
            dcsummary = Impdf.iloc[:5]
            # plot bars about importance
            ax.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
            # plot table on the top of the bar plot
            table = ax.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2])
            table.set_fontsize(15)
            table.scale(0.5, 2)
            # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
            # set axis label
            ax.set_ylabel('Importance', fontsize=20)
            ax.set_xlabel('Auxiliary channel', fontsize=20)
            ax.grid(False)
            return None


[docs]    def PlotOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file):
        '''
        description:
            plot glitch indecies versus channels
        USAGE: PlotOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param ListGPS: a list of GPS times
        :param ListDuration: a list of durations
        :param output_dir:
        :param output_file:
        :return: None
        dependencies: CreateChannelTicks()
        '''
        ##########
        #MatCount = MatCount[:, ::-1]
        #ListChannelName = ListChannelName[::-1]
        #############3
        if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful
            print('There is only one single glitch. Skip the plot of glitch index VS channels')
            return None
        # color the background either with blakc or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        # get xticks for dominant channels
        CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
        fig = plt.figure()
        # make grid for subplots
        gs = GridSpec(9, 30)
        # a subplot showing channel position
        ax1 = fig.add_subplot(gs[:1, :-1])  # First row, up to the second last column
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == MatCount.shape[-1] - 1: # adjust the right edge of the last color span
                edgeRight = edgeRight + 1
            ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        # limit the xaxis of the first subplot
        ax1.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        # get rid of ticks of the first plot
        ax1.yaxis.set_major_locator(plt.NullLocator())
        ax1.xaxis.set_major_locator(plt.NullLocator())
        # assign a size for the second subplot
        ax2 = fig.add_subplot(gs[1:, :-1])
        #MatCount = np.log10(MatCount+1)
        # CMAP = 'viridis'
        CMAP = 'OrRd' # change the color map
        ax2.imshow(MatCount, aspect='auto', interpolation='nearest', cmap=getattr(plt.cm, CMAP), origin='upper')
        ax2.set_xticks(CenterTicks)
        ax2.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
        ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
        ax2.tick_params(axis="y", labelsize=10)
        ax2.grid(False)
        ax2.set_ylabel('Glitch index', fontsize=15)
        ax2.set_xlabel('Auxiliary channel', fontsize=15)
        ###### Make a color bar
        cmap = getattr(matplotlib.cm, CMAP)
        norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
        ax3 = fig.add_subplot(gs[1:, -1])
        cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
        ax3.grid(False)
        cbar.set_label(r'$q$', size=15)
        # cbar.set_label('Importance', size=10)
        ########
        fig.subplots_adjust(hspace=0)
        output_path = os.path.join(output_dir, output_file)
        fig.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of Glitch occurrence VS channel')
        #-----------------------------------------------------------------+
        # make .csv file comprising GPS, importance of all the channels   |
        #-----------------------------------------------------------------+
        # concatenate a list of GPS times and importance of all the channels
        MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1)
        # make a list of column names
        columnName = np.append(np.array(['GPS']), np.array(['duration']))
        columnName = np.append(columnName, ListChannelName)
        # convert it to a pandas frame
        dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
        path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv')
        # save it as .csv file
        dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
        print('Completed saving a table comprising GPS times, durations, and importance of the channels')
        return None

[docs]    def MakeMatrixOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir):
        '''
        description:
            save matrix of GPS, duration, importance as .csv file
        USAGE: PlotOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param ListGPS: a list of GPS times
        :param ListDuration: a list of durations
        :param output_dir:
        :return: None
        dependencies: CreateChannelTicks()
        '''
        if MatCount.shape[0] == MatCount.shape[-1]:  # if only a glitch, this plot is not useful
            MatCount = MatCount.reshape(1, MatCount.shape[0]) # make row vector
        #-----------------------------------------------------------------+
        # make .csv file comprising GPS, importance of all the channels   |
        #-----------------------------------------------------------------+
        # concatenate a list of GPS times and importance of all the channels
        MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1)
        # make a list of column names
        columnName = np.append(np.array(['GPS']), np.array(['duration']))
        columnName = np.append(columnName, ListChannelName)
        # convert it to a pandas frame
        dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
        path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv')
        # save it as .csv file
        dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
        print('Completed saving a table comprising GPS times, durations, and importance of the channels')
        return None

[docs]    def PlotTimeVSChannel(self, MatCount, ListChannelName, ListGPS, output_dir, output_file, startT, endT, dt):
        '''
        description:
            make a plot of glitch times versus channels where the time flow is from top to bottom
            If there are more than one glitch in a time bin, those gltiches' values of importance are averaged
            If there is no glitches in a time bin, all the values of importance are set to be zero
        dependencies: CreateChannelTicks()
        USAGE: PlotTimeVSChannel(MatCount, ListChannelName, output_dir, output_file, startT, endT, dt)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param ListGPS: a list of GPS times
        :param output_dir: a path to an output directory
        :param output_file: a name of an output file
        :param startT: start time of an epoch
        :param endT: end time of an epoch
        :param dt: step size in sec of the time slice
        :return: None
        '''
        if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful
            print('There is only one single glitch. Skip the plot of glitch index VS channels')
            return None
        # ################
        # ListChannelName = np.array(df.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist())
        # ListGPS = df.GPS
        # dt = 60
        # MatCount = df.drop(['Unnamed: 0', 'GPS'], axis=1).values
        # #######################
        TimeSlice = np.arange(0, endT - startT, dt) # time steps from the beginning of the epoch with the step size of dt in
        # make a matrix with all the elements equal to zero, the size of this matrix is # of the time slice in row , # of channels in column
        TimeSlice_MatCount = np.zeros((TimeSlice.shape[0], MatCount.shape[-1]))
        # convert the GPS times of the sample to the time since the beginning of the epoch
        Time_sample = ListGPS - startT
        # fill the values of the importance of the sample into the time slices
        for ind in range(len(TimeSlice)): # iterate though the time slice
            # find the index of the samples that are within this time slice
            index_within = np.where((Time_sample >= TimeSlice[ind]) & (Time_sample < TimeSlice[ind] + dt))[0]
            if index_within.size > 0: # if more than one sample is in this time slice
                # take the average of the values of importance of the samples in this time slice, channel by channel
                TimeSlice_MatCount[ind, :] = MatCount[index_within[0]:index_within[-1]+1, :].mean(axis=0)
            else: # if no samples is in this time slice, go to the next iteration
                pass
        # color the background either with blakc or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        # get xticks for dominant channels
        CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
        fig = plt.figure()
        # make grid for subplots
        gs = GridSpec(9, 30)
        # a subplot showing channel position
        ax1 = fig.add_subplot(gs[:1, :-1])  # First row, up to the second last column
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == MatCount.shape[-1]: # adjust the right edge of the last color span
                edgeRight = edgeRight + 1
            ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        # limit the xaxis of the first subplot
        ax1.axis(xmin=0, xmax=TimeSlice_MatCount.shape[-1] * 1.001)
        # get rid of ticks of the first plot
        ax1.yaxis.set_major_locator(plt.NullLocator())
        ax1.xaxis.set_major_locator(plt.NullLocator())
        # assign a size for the second subplot
        ax2 = fig.add_subplot(gs[1:, :-1])
        #MatCount = np.log10(MatCount+1)
        ax2.imshow(TimeSlice_MatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
        ax2.set_xticks(CenterTicks)
        ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
        ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
        ax2.tick_params(axis="y", labelsize=10)
        ax2.grid(False)
        ax2.axis(xmin=0, xmax=TimeSlice_MatCount.shape[-1] * 1.001)
        start_epoch_UTC = tconvert(startT) # make gwpy object of UTC time
        # set y label
        if dt == 60:
            unit_time = 'mins'
        elif dt == 3600:
            unit_time = 'hours'
        elif dt == 1:
            unit_time = 'secs'
        ax2.set_ylabel('Time [{0}] since {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(unit_time, start_epoch_UTC.year, start_epoch_UTC.month, start_epoch_UTC.day, start_epoch_UTC.hour, start_epoch_UTC.minute, start_epoch_UTC.second), fontsize=15)
        ax2.set_xlabel('Auxiliary channel', fontsize=15)
        ###### Make a color bar
        cmap = matplotlib.cm.viridis
        norm = matplotlib.colors.Normalize(vmin=TimeSlice_MatCount.min(), vmax=TimeSlice_MatCount.max())
        ax3 = fig.add_subplot(gs[1:, -1])
        cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
        ax3.grid(False)
        cbar.set_label('Importance', size=10)
        ########
        fig.subplots_adjust(hspace=0)
        output_path = os.path.join(output_dir, output_file)
        fig.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of time VS channel')
        return None



[docs]    def PlotSNRVSChannel(self, MatCount, ListChannelName, ListSNR, output_dir, output_file):
        '''
        description:
            make a plot of SNR of h(t) versus channels
        dependencies: CreateChannelTicks()
        USAGE: PlotSNRVSChannel(self, MatCount, ListChannelName, ListSNR, output_dir, output_file)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param ListSNR: a list SNRs
        :param output_dir:
        :param output_file:
        :return: None
        '''
        if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful
            print('There is only one single glitch. Skip the plot of SNR VS channels')
            return None
        # color the background either with blakc or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        # get xticks for dominant channels
        CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
        fig = plt.figure()
        # make grid for subplots
        gs = GridSpec(9, 30)
        # a subplot showing channel position
        ax1 = fig.add_subplot(gs[:1, :-1])
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == MatCount.shape[-1] - 1: # adjust the right edge of the last color span
                edgeRight = edgeRight + 1
            ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        # limit the xaxis of the first subplot
        ax1.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        # get rid of ticks of the first plot
        ax1.yaxis.set_major_locator(plt.NullLocator())
        ax1.xaxis.set_major_locator(plt.NullLocator())
        # assign a size for the second subplot
        ax2 = fig.add_subplot(gs[1:, :-1])  # First row, first column
        # sort based on SNR values
        MatCountSortedSNR = np.array([MatCount[i, :] for i in np.argsort(ListSNR)[::-1]])
        #MatCountSortedSNR = np.log10(MatCountSortedSNR+1)
        ax2.imshow(MatCountSortedSNR, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
        ax2.set_xticks(CenterTicks)
        ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
        ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
        ax2.tick_params(axis="y", labelsize=10)
        ax2.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        ax2.grid(False)
        fig.subplots_adjust(hspace=0)
        y_ticks = np.sort(ListSNR)[::-1] # descending values of SNR
        # take dominant dicks, e.g, when the first digit changes
        ListYticksNew = [0]
        ListYTicksOld = []
        SNR_range = y_ticks.max() - y_ticks.min() # the range of SNR
        if SNR_range > 1000: # if the range of SNR is large (e.g., 100)
            RoundPoint = -2 # round up at 100th
        elif SNR_range > 100:
            RoundPoint = -1 # round up at 10th
        else:
            RoundPoint = 0 #  round up at 1th
        for ind, i in enumerate(np.round(y_ticks, RoundPoint), 0): # iterate over rounded numbers of SNR
            if ListYticksNew[-1] != i: # if difference changes compared with the previous value
                ListYticksNew.append(i)
                ListYTicksOld.append(ind)
        ListYticksNew = ListYticksNew[1:]
        ax2.set_yticks(ListYTicksOld)
        ax2.set_yticklabels(ListYticksNew)
        ax2.set_ylabel('SNR of h(t)', fontsize=15)
        ax2.set_xlabel('Auxiliary channel', fontsize=15)
        ###### Make a color bar
        cmap = matplotlib.cm.viridis
        norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
        ax3 = fig.add_subplot(gs[1:, -1])
        cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
        ax3.grid(False)
        cbar.set_label('Importance', size=10)
        ########
        output_path = os.path.join(output_dir, output_file)
        fig.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of SNR VS channel')
        return None

[docs]    def PlotConfidenceVSChannel(self, MatCount, ListChannelName, ListConf, output_dir, output_file):
        '''
        make a plot of values of confidence level of Gravity Spy versus channels
        dependencies: CreateChannelTicks()
        USAGE: PlotConfidenceVSChannel(MatCount, ListChannelName, ListConf, output_dir, output_file)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param ListConf: a list confidence levels
        :param output_dir:
        :param output_file:
        :return: None
        '''
        if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful
            print('There is only one single glitch. Skip the plot of confidence level VS channels')
            return None
        # color the background either with blakc or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        # get xticks for dominant channels
        CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
        fig = plt.figure()
        # make grid for subplots
        gs = GridSpec(9, 30)  # 9 rows, 25 columns
        # a subplot showing channel position
        ax1 = fig.add_subplot(gs[:1, :-1])  # First row, first column
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == MatCount.shape[-1] - 1: # adjust the right edge of the last color span
                edgeRight = edgeRight + 1
            ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        # limit the xaxis of the first subplot
        ax1.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        # get rid of ticks of the first plot
        ax1.yaxis.set_major_locator(plt.NullLocator())
        ax1.xaxis.set_major_locator(plt.NullLocator())
        # assign a size for the second subplot
        ax2 = fig.add_subplot(gs[1:, :-1])  # First row, first column
        # sort based on SNR values
        MatCountSortedConf = np.array([MatCount[i, :] for i in np.argsort(ListConf)[::-1]])
        #MatCountSortedConf = np.log10(MatCountSortedConf + 1)
        ax2.imshow(MatCountSortedConf, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
        ax2.set_xticks(CenterTicks)
        ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
        ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
        ax2.tick_params(axis="y", labelsize=10)
        ax2.grid(False)
        ax2.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5)
        fig.subplots_adjust(hspace=0)
        y_ticks = np.sort(ListConf)[::-1]
        ListYticksNew = [0]
        ListYTicksOld = []
        # take dominant dicks, e.g, when the first decimal point changes
        for ind, i in enumerate(np.round(y_ticks, 1), 0):
            if ListYticksNew[-1] != i:
                ListYticksNew.append(i)
                ListYTicksOld.append(ind)
        ListYticksNew = ListYticksNew[1:]
        ax2.set_yticks(ListYTicksOld)
        ax2.set_yticklabels(ListYticksNew)
        ax2.set_ylabel('Confidence level', fontsize=15)
        ax2.set_xlabel('Auxiliary channel', fontsize=15)
        ###### Make a color bar
        cmap = matplotlib.cm.viridis
        norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
        ax3 = fig.add_subplot(gs[1:, -1])
        cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
        ax3.grid(False)
        cbar.set_label('Importance', size=10)
        ########
        output_path = os.path.join(output_dir, output_file)
        fig.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of Confidence VS channel')
        return None


[docs]    def TableImportance(self, MatCount, ListChannelName, output_dir, output_file):
        '''
        make a table of values of importance as a .csv file
        USAGE: TableImportance(MatCount, ListChannelName, output_dir, output_file)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName: a list of channel names
        :param output_dir:
        :param output_file:
        :return: None
        '''
        if MatCount.shape[0] == MatCount.shape[-1]:  # for a matrix with one row
            MatCount = MatCount.reshape(1, MatCount.shape[-1])
        # sum all the counts over glitches
        SumMatCount = np.sum(MatCount, axis=0) / float(MatCount.shape[0])
        # a list of sorted Channel names & sorted Importance
        RankChannel = list(zip(np.argsort(SumMatCount)[::-1], ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]]))
        # make pandas matrix
        Impdf = pd.DataFrame(np.array(RankChannel), columns=('label', 'Channel', 'Importance'))
        # add rank label
        Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1))
        # round importance
        Impdf['Importance'] = Impdf['Importance'].astype(float).round(3)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Impdf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Impdf = Impdf[cols]
        output_path = os.path.join(output_dir, output_file)
        # save it
        Impdf.to_csv(output_path)
        print('Completed saving a table of importance of channels')
        return None

[docs]    def BinomialDist(self, k, n, likelihood):
        '''
        description: Binomial distribution
        USAGE: out = BinomialDist(self, k, n, likelihood)

        :param k: the number of detections
        :param n: the number of trials
        :param p: likelihood of detection
        :return: out: a value of probability density to find k detection out of n trials
        '''
        #pro_dens = factorial(n) / factorial(k) / factorial(n - k) * likelihood ** k * (1 - likelihood) ** (n - k)
        # use the scipy.stats as the equation above fails for a large value of n, or k
        pro_dens = binom.pmf(k, n, likelihood) # binomial probability mass function
        return pro_dens

[docs]    def BinomialTest(self, k, N, rate):
        '''
        description: compute p-value of one-tailed Binomial test against null rate
        USAGE: p_value = BinomialTest(k, N, rate)

        :param k: observed numberof successes
        :param N: total number of samples
        :param rate: rate of successes drawn from a null hypothesis
        :return: p_value: probablity of successes equal or greater than the observed number of successes
        '''
        #----  the manual calculate is much slower than scipy.stats.binom_test so that it is replaced
        #list_k = np.linspace(k, N, N - k + 1)
        #list_pro = np.vectorize(self.BinomialDist)(list_k, N, rate)
        #p_value = list_pro.sum()
        #-----------------------------
        if k/float(N) >= rate: # one-sided binomial test condition applied
            p_value = binom_test(k, N, rate)/2. # use scipy.stats
        else: # one-sided binomial test condition applied
            p_value = 1 - binom_test(k, N, rate)/2. # use scipy stats
        return p_value

[docs]    def query_targetglitch_null(self, path_target_glitch_dataset, path_null_dataset):
        '''
        description:
            load files, otherwise, end the program

        :param path_target_glitch_dataset: a path to .csv file of a target gltich class
        :param path_null_dataset: a path to .csv file of a null dataset
        :return:
            df1: a pandas dataframe of a target glitch class
            df0: a pandas datafram of a null dataset
        '''
        try:
            df1 = pd.read_csv(path_target_glitch_dataset)
        except IOError:
            print('No such a file {}'.format(path_target_glitch_dataset))
            sys.exit()
        try:
            df0 = pd.read_csv(path_null_dataset)
        except IOError:
            print('No such a file {}'.format(path_null_dataset))
            sys.exit()
        return df1, df0

[docs]    def Student_t_independet_test(self, data1, data2, Welch_test=True):
        '''
        Independent Student t-test
        USAGE: stat, p_value = self.Student_t_independet_test(data1, data0)

        :param data1: a population
        :param data2: another population
        :return:
            t_value: t-value (critical value)
            p: p-value
        '''
        if data1.std(ddof=1) + data2.std(ddof=1) == 0: # if the sum of the variances are zero, t-value will be infinity
            t_value = 0 # manually set zero
            p_value = 1 # manually set 1
            return t_value, p_value
        if Welch_test == True:
            t_value, p_value = ttest_ind(data1, data2, equal_var=False)
        else:
            t_value, p_value = ttest_ind(data1, data2)
        return t_value, p_value


[docs]    def chisquare_test(self, target_mat, null_mat):
        '''
        description:
            calculate the values of chi-square of each channel in each glitch
            and put out values of chi-square and corresponding p-values
        USAGE: chi2_value, p_value = chisquare_test(self, target_mat, null_mat)

        :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
        :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
        :return:
            chi2_value: a matrix of chi-square values where glitch indices are in rows and channels are in columns
            p_value: a matrix of p-values where glitch indices are in rows and channels are in columns
        '''
        # degrees of freedom is 1 for a single point chi-square test
        ddof = 1
        mean_null = null_mat.mean(axis=0)  # calcualte the mean of null samples channel by channel
        std_null = null_mat.std(ddof=ddof,
                                axis=0)  # calculate the standard deviation of the null samples channel by channel
        # chi-square values
        chsqr_values = ((target_mat - mean_null) / std_null) ** 2
        # filter (correct)
        # when std_null is zero
        # chi-square value will nan when the nominator is zero (it's supposed to be zero)
        # chi-square value will be infinite when the nominator is non-zero
        where_are_NaNs = np.isnan(chsqr_values)  # indices where elements are nan
        chsqr_values[where_are_NaNs] = 0  # replace with zero
        where_are_inf = np.isposinf(chsqr_values)  # indices where elements are positive infinite
        chsqr_values[where_are_inf] = 0  # replace with zero (forcefully set even if not correct)
        # calculate p-value of the right-sided  chi-square test
        p_values = 1 - chi2.cdf(chsqr_values, ddof)
        return chsqr_values, p_values

[docs]    def find_channels(self, df_target):
        '''
        description:
            find a list of channels from the target samples
        USAGE: channels = find_channels(df_target)

        :param df_target: target samples in pandas frame
        :return: channels: a list of channels
        '''
        try:
            channels = np.array(df_target.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1).columns.tolist()) # new format of the table
        except ValueError:
            channels = np.array(df_target.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist()) # old format of the table
        return channels


[docs]    def Calculate_number_and_rate(self, df_target, df_null, d_c, err_cal, channels):
        '''
        description:
            1. use the target and null samples
            2. calculate the numbers of the target and null samples above threshold
            3. calculate the fraction of the target and null samples above threshold
        USAGE: channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err = Calculate_number_and_rate(df_target, df_null, d_c=None, err_cal=False, channels=None)

        :param df_target: target samples in pandas frame
        :param df_null: null samples in pandas frame
        :param d_c: a threshold. if it is None, the threshold is the mean value of null samples
        :param err_cal: boolen, wether calculate the error of the statistics or not
        :param channels: a list of channels
        :return:
            channels: a list of channels
            list_num1: a list of the number of the target samples above a threshold
            list_num0: a list of the number of the null samples above a threshold
            total_sample1: the number of the target samples, float
            total_sample0: the number of the null samples, float
            list_Causal: a list of the statistics
            list_causal_err: a list of the errors of the statistics. It is None if err_cal = False
        '''
        if type(channels) == int: # channels are not provided, it uses all the channels in the target samples
            # find a list of channels from the target samples
            channels = self.find_channels(df_target)
        else:
            pass


        list_num1 = np.array([])  # a list of a number of samples of target glitches that have values greater than the detection threshold
        list_num0 = np.array([])  # a list of a number of samples of dummpy quite dataset that have values greater than the detection threshold
        list_Causal = np.array([])  # a list of the causal probability
        list_causal_err = np.array([]) # a list of the error of the causal probability
        total_sample1 = float(df_target.shape[0])  # total number of target glitches
        total_sample0 = float(df_null.shape[0])  # total number of dummy quite samples
        for ch in channels:  # iterate though channels
            # calculate the threshold of claiming detection of a glitch
            if d_c == None:
                d_c = df_null['{}'.format(ch)].mean()
            else:
                pass
            # calculate the number of samples that have values of importance greater than the detection threshold
            num_d1 = df_target[df_target['{}'.format(ch)] > d_c].shape[0]
            num_d0 = df_null[df_null['{}'.format(ch)] > d_c].shape[0]
            # calculate the fraction samples that have values greater than the detection threshold
            rate1 = num_d1 / total_sample1
            rate0 = num_d0 / total_sample0
            # calculate the probability of the causality
            try:
                Causal = rate1 / (rate1 + rate0)
            except ZeroDivisionError:  # if the denominator is zero, manually set it zero (it is ok, this channel should not pass Binomial test)
                Causal = 0
            if err_cal == True:
                # calculate the error of the causal probability
                err1 = np.sqrt(rate1 * (1 - rate1) / float(total_sample1)) # the error of true positive based on multinominal distribution
                err0 = np.sqrt(rate0 * (1 - rate0) / float(total_sample0)) # the error of false positive based on multinominal distribution
                try:
                    causal_err = (rate1 + rate0) **(-2) * np.sqrt(rate0**2 * err1**2 + rate1**2 + err0**2) # based on the error propagation
                except ZeroDivisionError:
                    causal_err = 0
            else:
                pass
            # fill the values
            list_num1 = np.append(list_num1, num_d1)
            list_num0 = np.append(list_num0, num_d0)
            list_Causal = np.append(list_Causal, Causal)
            if err_cal == True:
                list_causal_err = np.append(list_causal_err, causal_err)
            else:
                list_causal_err = None
        return channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err


[docs]    def find_meaing_ful_confidence(self, df1, df0, BinomialTestConfidence, d_c=None, err_cal=False, channels=0):
        '''
        description
            1. load files of target glitches (df1) and dummy quite data set (df0)
            2. compute true positive probability
            3. perform one-tailed Binomial test
        USAGE: list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels= find_meaing_ful_confidence(df1, df0, BinomialTestConfidence, d_c)

        :param df1: target glitches in the pandas format
        :param df0: null dataset in the pandas format
        :param BinomialTestConfidence: a confidence level used for one-tailed Binomial test
        :param d_c: user defined threshold to claim detection of a glitch, None in default
        :param channels: a list of channels, 0 in default
            if d_c is None, values of threshold is given by the mean value of importance generated by the dummy quite dataset
        :return:
            list_Causal_passed: a list of the probability of the causality that are passed one-tailed Binomial test, otherwise, zero
            list_Causal_fail: a list of the probability of the causality that are failed one-tailed Binomial test, otherwise, zero
            list_causal_passed_err: a list of the error of the causal probability that are passed one-tailed Binomial test, otherwise, zero
            list_causal_failed_err: a list of the error of the causal probability that are failed one-tailed Binomial test, otherwise, zero
            list_Test: a list of results of the Binomial test, 'pass' or 'fail'
            channels: a list of channel names
        '''
        channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err = self.Calculate_number_and_rate(df1, df0, d_c=d_c, err_cal=err_cal, channels=channels)
        list_rate0 = list_num0 / total_sample0 # a list of the fractions of the null samples above the threshold
        list_Causal_passed = np.zeros(list_Causal.shape[0]) # initial values are zero
        if err_cal == True: # if the error of the causal probability is calculated
            list_causal_passed_err = np.zeros(list_causal_err.shape[0])  # initial values are zero
        else: # other wise, it is None
            list_causal_passed_err = None
        list_Test = np.array([])
        # perform Binomial test
        for index in range(len(channels)):  # iterate though the index of channels
            num1 = list_num1[index]  # the number of times when this channel shows values greater than the detection threshold
            rate0 = list_rate0[index]  # the fraction of samples of the dummy quite dataset shows values greater than the detection threshold
            # compute p-values of one-tailed Binomial test against null hypothesis
            p_value = self.BinomialTest(num1, total_sample1, rate0)
            if p_value < (1 - BinomialTestConfidence):  # if pass {BinomialTestConfidence}% confidence level
                list_Causal_passed[index] = list_Causal[index]  # fill the probability of the causality. If a channel passes the test, the causal probability is non-zero value, otherwise, zero. This process is needed for the plottting
                if err_cal == True:
                    list_causal_passed_err[index] = list_causal_err[index] # fill the error of the probability of the causality.
                else:
                    pass
                list_Test = np.append(list_Test, 'pass') # fill the mark of pass
            else:
                list_Test = np.append(list_Test, 'fail') # fill the mark of fail
        list_Causal_fail = list_Causal - list_Causal_passed  # a list of the probability of the causality that failed the one-tailed Binomial test
        if err_cal == True: # if the error of the causal probability is calculated
            list_causal_failed_err = list_causal_err - list_causal_passed_err  # a list of the error of the probability of the causality that failed the one-tailed Binomial test
        else: # otherwise, it is None
            list_causal_failed_err = None
        return list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, channels


[docs]    def PlotCausalityVSChannel(self, list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
        '''
        description:
            Calculate the probabilities of the causality of channels
        USAGE: PlotCausalityVSChannel(list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence)

        :param list_Causal_passed: a list of the probability of the causality that are passed one-tailed Binomial test, if not zero
        :param list_Causal_fail: a list of the probability of the causality that are failed one-tailed Binomial test, if not zero
        :param list_causal_passed_err: a list of the error of the causal probability that are passed one-tailed Binomial test, otherwise, zero
        :param list_causal_failed_err: a list of the error of the causal probability that are failed one-tailed Binomial test, otherwise, zero
        :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
        :param ListChannelName: a list of channel names
        :param output_dir: (only used for all glitches)
        :param output_file: (only used for all glitches)
        :param BinomialTestConfidence: binomial test confidence level
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
        # color the background either with black or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_Causal_passed.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # ++++++++  plot bar plot about the probability of the causality that pass or fail the Binomial test and ++++++++++++++++
        #plt.bar(np.arange(list_Causal_passed.shape[-1]), list_Causal_passed, 1, yerr=list_causal_passed_err, align='center', edgecolor='blue', color='blue', linewidth=0.)
        #plt.bar(np.arange(list_Causal_fail.shape[-1]), list_Causal_fail, 1, yerr=list_causal_failed_err, align='center', edgecolor='red', alpha=0.3, color='red', linewidth=0.)
        plt.bar(np.where(list_Causal_fail != 0)[0], list_Causal_fail[list_Causal_fail != 0], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        plt.bar(np.where(list_Causal_passed != 0)[0], list_Causal_passed[list_Causal_passed != 0], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # a list of the probability of the causality
        list_Causal = list_Causal_passed + list_Causal_fail
        # list of the error of the probability of the causality
        # list_causal_err = list_causal_passed_err + list_causal_failed_err
        # sort channel based on the probability of the causality
        if ListChannelName[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), ListChannelName.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            ListChannelName = np.array([ChannelName.rsplit("_BP", 1)[0] for ChannelName in ListChannelName])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_Causal, list_Test)
            RankChannel = list(zip(ListChannelName[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'WRS', 'Test{}'.format(BinomialTestConfidence)))
            # define the table size
            ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_Causal, list_Test)
            RankChannel = list(zip(ListChannelName[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'WRS', 'Test{}'.format(BinomialTestConfidence)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_Causal_passed.shape[-1] + 1, 1))
        # round importance
        Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        # table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2])
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # ++++++++++++++  alter ticks +++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # set axis label
        plt.ylabel('Witness ratio statistic', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 0.8])
        # save a plot
        output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of the witness ratio statisti VS channel')
        return None


[docs]    def TableCausality(self, list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
        '''
        make a table of witness ratio statistic (WRS) of channels as a .csv file
        USAGE: TableCausality(self, list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file)

        :param list_Causal_passed: a list of the probability of the causality that are passed one-tailed Binomial test, if not zero
        :param list_Causal_fail: a list of the probability of the causality that are failed one-tailed Binomial test, if not zero
        :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
        :param ListChannelName: a list of channel names
        :param output_dir:
        :param output_file:
        :param BinomialTestConfidence: binomial test confidence level
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        # a list of the probabilit of the causality
        list_Causal = list_Causal_passed + list_Causal_fail
        if ListChannelName[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), ListChannelName.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            ListChannelName = np.array([ChannelName.rsplit("_BP", 1)[0] for ChannelName in ListChannelName])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_Causal, list_Test)
            RankChannel = list(zip(ListChannelName[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'WRS', 'Test{}'.format(BinomialTestConfidence)))
        else:
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_Causal, list_Test)
            RankChannel = list(zip(list_sorted_base_index_pass_fail, ListChannelName[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('label', 'Channel', 'WRS', 'Test{}'.format(BinomialTestConfidence)))
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_Causal.shape[-1] + 1, 1))
        # round importance
        Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        output_path = os.path.join(output_dir, output_file)
        # save it
        Prodf.to_csv(output_path, index=False)
        print('Completed saving a table of the witness ratio statistic of channels')
        return None


    # def calculate_FAP(self, df1, df0, mode):
    #     '''
    #     description:
    #         1. calculate false alarm probability (FAP)
    #     USAGE: df_FAP = calculate_FAP(df1, df0, mode='indiv')
    #
    #     :param df1: a target smaples in pandas frame
    #     :param df0: a null samples in pandas frame
    #     :param mode: a mode to calculate FAP. 'whole' calculates FAP of the average value of importance of glitches 'indiv' calculates FAP of each glitch in each channel
    #     :return:
    #         if mode == 'whole':
    #             list_FAP: numpy array of FAPs of channels
    #             list_FAP_err: numpy array of errors of FAP of channels
    #             channels: a list of channels
    #             USAGE: list_FAP, list_FAP_err = calculate_FAP(df1, df0, mode='whole')
    #         elif mode == 'indiv':
    #             df_FAP: a matrix of FAP of each glitch in each channel in pandas frame
    #             channels: a list of channels
    #     '''
    #     try:
    #         channels = np.array(df1.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1).columns.tolist())
    #     except ValueError:
    #         channels = np.array(df1.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist())
    #     if mode == 'whole': # calculate false alarm probablity (FAP) of the average value of importance of target glitches
    #         list_FAP = np.array([])
    #         list_FAP_err = np.array([])
    #         total_sample0 = df0.shape[0]  # total number of dummy quite samples
    #         for ch in channels:  # iterate though channels
    #             # calculate the threshold of claiming detection of a glitch
    #             d_c = df1['{}'.format(ch)].mean()
    #             # calculate the number of samples that have values of importance greater than the detection threshold
    #             num_d0 = df0[df0['{}'.format(ch)] > d_c].shape[0]
    #             # calculate the fraction samples that have values greater than the threshold, FAP
    #             FAP = num_d0 / float(total_sample0)
    #             # calculate the probability of the causality
    #             FAP_err = np.sqrt(FAP * (1 - FAP) / float(total_sample0))  # the error of false positive based on multinominal distribution
    #             # fill the values
    #             list_FAP = np.append(list_FAP, FAP)
    #             list_FAP_err = np.append(list_FAP_err, FAP_err)
    #         return list_FAP, list_FAP_err, channels
    #     if mode == 'indiv':
    #         df_FAP = pd.DataFrame()
    #         total_sample0 = float(df0.shape[0])  # total number of dummy quite samples
    #         for ch in channels:  # iterate though channels
    #             # calculate FAP of each glitch for this channel
    #             # note that d_c is measured importance of this glitch in this channel
    #             # note that [df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 is a fraction of null samples above d_c, which is FAP
    #             FAP = np.array([df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 for d_c in df1['{}'.format(ch)]])
    #             df_FAP['{}'.format(ch)] = FAP
    #         return df_FAP, channels


[docs]    def calculate_fap(self, target_mat, null_mat):
        '''
        description:
            calculate the values of fap of each channel in each glitch
        USAGE: faps = calculate_fap(target_mat, null_mat)

        :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
        :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
        :return: fap
        '''

        list_mean_null = null_mat.mean(axis=0)  # calcualte the mean of null samples channel by channel
        list_var_null = null_mat.var(ddof=1, axis=0)  # calculate the standard deviation of the null samples channel by channel
        list_zero_mean_null_index = np.where(list_mean_null == 0)[0] # find the indecies where the mean of the null equal to zero
        list_zero_var_null_index = np.where(list_var_null == 0)[0] # find the indecies where the variance of the null equal to zero
        list_both_zero_mean_var_null_index = np.intersect1d(list_zero_mean_null_index, list_zero_var_null_index)
        list_a_null = np.array([mean_null ** 2 * (1. - mean_null) / var_null - mean_null if mean_null != 0 and var_null != 0 else -1 for mean_null, var_null in zip(list_mean_null, list_var_null)]) # the parameter of beta dist
        list_b_null = np.array([a_null * (1 - mean_null) / mean_null if mean_null != 0 else -1 for a_null, mean_null in zip(list_a_null, list_mean_null)])  # the parameter of beta dist
        # if the mean and the variance of the null are both zero, the parameters are zero
        # the null distribution is a delta function at zero
        # then, the FAP will be NaN
        # to fix this issue, I manually set the beta distribution which are steap at zero
        list_a_null[list_both_zero_mean_var_null_index] = 0.01
        list_b_null[list_both_zero_mean_var_null_index] = 100
        # when mean_null - mean_null **2 - var_null is negative a_null will be negative, which is not allowed in beta distribution
        # but in this case, mean_null and var_null are both very small value, but there are a few samples with values of importance are large
        # This means it looks like superposition of two delta functions at 0 and 1
        # so I set a_null = 0.01 manually
        # the re-calculate b_null
        list_index_negative_a_null = np.where(list_a_null < 0)[0]
        list_a_null[list_index_negative_a_null] = 0.01
        list_b_null[list_index_negative_a_null] = list_a_null[list_index_negative_a_null] * (1 - list_mean_null[list_index_negative_a_null]) / list_mean_null[list_index_negative_a_null]
        rv_null = beta(list_a_null, list_b_null)
        faps = 1 - rv_null.cdf(target_mat)
        return faps


[docs]    def calculate_reweighted_importance(self, df1, df_FAP):
        '''
        description:
            reweight the values of importance using FAP
            the reweighted importance is defined as rho_new = rho / (FAP + 1)
        USAGE: df1_new = calculate_reweighted_importance(df1, df_FAP)

        :param df1: a target samples in pandas frame
        :param df_FAP: a FAP matrix in pandas frame
        :return: df1_new: reweighted importance matrix in pandas frame
        '''
        try:
            channels = np.array(df1.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1).columns.tolist())
        except ValueError:
            channels = np.array(df1.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist())
        mat1 = df1[channels].values
        mat_FAP = df_FAP[channels].values
        mat1_new = mat1 / (mat_FAP + 1)
        df1_reweighed = pd.DataFrame(mat1_new, columns=channels)
        df1_new = df1.drop(channels, axis=1)
        df1_new = pd.concat([df1_new, df1_reweighed], axis=1)
        return df1_new



[docs]    def perform_Welch_test(self, df_target, df_null, confidence_level, channels=0):
        '''
        description:
            perform one-sided Welch t-test
        USAGE: channels, list_t_values_passed, list_t_values_failed, list_Test = perform_Welch_test(df_target, df_null, confidence_level, channels=None)

        :param df_target: target samples in pandas frame
        :param df_null: null samples in pandas frame
        :param confidence_level: a confidence level
        :param channels: a list of channels, 0 in default
        :return
            channels: a list of channels
            list_t_values_passed: a list of t-values that pass the test
            list_t_values_failed: a list of t-vlaues that fail the    test
            list_Test: a list of the test results {'pass', 'fail'}
        '''
        # find a list of channels from the target samples
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        list_t_values_passed = np.array([])
        list_t_values_failed = np.array([])
        list_Test = np.array([])
        for channel in channels:
            if df_null['{}'.format(channel)].std(ddof=1) == 0: # if the variance of the null samples is zero
                null_samples = np.random.normal(loc=df_null['{}'.format(channel)].mean(), scale=0.01, size=1000)
                t_value, p_value = self.Student_t_independet_test(df_target['{}'.format(channel)], null_samples)
            else:
                t_value, p_value = self.Student_t_independet_test(df_target['{}'.format(channel)], df_null['{}'.format(channel)])
            if p_value / 2. < (1 - confidence_level) and t_value > 0: # one-sided Welch t-test
                list_t_values_passed = np.append(list_t_values_passed, t_value)  # fill t-value if this channel passes the test
                list_t_values_failed = np.append(list_t_values_failed, 0) # fill zero
                list_Test = np.append(list_Test, 'pass')  # fill the mark of pass
            else:
                list_t_values_passed = np.append(list_t_values_passed, 0) # fill zero
                list_t_values_failed = np.append(list_t_values_failed, t_value) # fill t-value if this channel fails the test
                list_Test = np.append(list_Test, 'fail')  # fill the mark of fail
        return channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level


[docs]    def perform_beta_dist(self, df_target, df_null, channels=0):
        '''
        description:
            create beta distribution fits for target and null samples
        USAGE: rv_t_dict, rv_n_dict = perform_beta_dist(df_target, df_null, channels=0)

        :param df_target: target samples in pandas frame
        :param df_null: null samples in pandas frame
        :param channels: channels (optional)
        :return:
            rv_t_dict: a dictionary of beta distribution (scipy obj) for the target samples
            rv_n_dict: a dictionary of beta distribution (scipy obj) for the null samples
        '''
        # find a list of channels from the target samples
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        rv_t_dict = {}
        rv_n_dict = {}
        for channel in channels:
            T = df_target[channel].values
            BG = df_null[channel].values
            # for target samples ------
            mean_t = T.mean()
            var_t = T.var(ddof=1)
            if mean_t != 0 and var_t != 0:
                a_t = mean_t ** 2 * (1. - mean_t) / var_t - mean_t
            else:
                a_t = 0.1
            if mean_t != 0:
                b_t = a_t * (1 - mean_t) / mean_t
            else:
                b_t = 100
            if a_t < 0:
                a_t = 0.1
                b_t = a_t * (1 - mean_t) / mean_t
            rv_t = beta(a_t, b_t)
            # fill it in the dictionary
            rv_t_dict[channel] = rv_t
            # for null samples ----------
            mean_null = BG.mean()
            var_null = BG.var(ddof=1)
            if mean_null != 0 and var_null != 0:
                a_null = mean_null ** 2 * (1. - mean_null) / var_null - mean_null
            else:
                a_null = 0.1
            if mean_null != 0:
                b_null = a_null * (1 - mean_null) / mean_null
            else:
                b_null = 100
            if a_null < 0:
                a_null = 0.1
                b_null = a_null * (1 - mean_null) / mean_null
            rv_null = beta(a_null, b_null)
            # fill it in the dictionary
            rv_n_dict[channel] = rv_null
        return rv_t_dict, rv_n_dict

[docs]    def perform_p_greater(self, df_target, rv_t_dict, rv_n_dict, confidence_level, channels=0):
        '''
        description:
            calcualte p_greater for channels
            note that set p_greater to be 0.5 if the p_greater is not monotonically growing for the target samples


        USAGE: channels, p_greater_dict, list_p_greater, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level = perform_p_greater(df_target, rv_t_dict, rv_n_dict, confidence_level, channels=0)

        :param df_target: target samples in pandas frame
        :param rv_t_dict: a dictionary of beta distribution (scipy obj) for the target samples
        :param rv_n_dict: a dictionary of beta distribution (scipy obj) for the null samples
        :param confidence_level: confidence level
        :param channels: channels
        :return:
            channels: list of channels
            p_greater_dict: a dictionary of p_greater
            list_p_greater: a list of p_greater
            list_p_greater_passed: a lisf of p_greater where p_greater is kept if the value greater than confidence level, other wise 0
            list_p_greater_failed: a lisf of p_greater where p_greater is kept if the value less than confidence level, other wise 0
            list_Test: list of {pas, fail}
            confidence_level: confidence level
        '''
        # find a list of channels from the target samples
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        p_greater_dict = {}
        list_p_greater = np.array([])
        list_p_greater_passed = np.array([])
        list_p_greater_failed = np.array([])
        list_Test = np.array([])
        for channel in channels:
            # additional safety: target should be monotonically increase in p_belong
            dbin = 0.00005
            p_belong_safety = []
            T = df_target[channel].values
            for x in [T.min(), T.max()]:
                if x == 0:
                    x += 0.0001
                if x == 1:
                    x -= 0.0001
                    if T[T != 1].max() < 0.5 * x:  # if the x = 1 is very edge of the long tail, pro_fore_belong will be the same as pro_back_belong
                        x = T[T != 1].max() + 0.1
                pro_fore_belong = (rv_t_dict[channel].cdf(x + dbin) - rv_t_dict[channel].cdf(x)) * rv_n_dict[
                    channel].cdf(x)
                pro_back_belong = (rv_n_dict[channel].cdf(x + dbin) - rv_n_dict[channel].cdf(x)) * rv_t_dict[
                    channel].cdf(x)
                if pro_fore_belong == pro_back_belong:
                    p_belong = 0.5
                else:
                    p_belong = pro_fore_belong / float((pro_fore_belong + pro_back_belong))
                p_belong_safety.append(p_belong)
            if p_belong_safety[0] < p_belong_safety[1]:  # p_belong for the largest x should be greater than that of the minimum x
                # calculate prob of greater
                p_greater, error = integrate.quad(lambda x: rv_t_dict[channel].pdf(x) * rv_n_dict[channel].cdf(x), -0.1,
                                                  1.1)
            else:
                p_greater = 0.5
            p_greater_dict[channel] = p_greater
            list_p_greater = np.append(list_p_greater, p_greater)
            if p_greater > confidence_level: # one-sided Welch t-test
                list_p_greater_passed = np.append(list_p_greater_passed, p_greater)  # fill t-value if this channel passes the test
                list_p_greater_failed = np.append(list_p_greater_failed, 0) # fill zero
                list_Test = np.append(list_Test, 'pass')  # fill the mark of pass
            else:
                list_p_greater_passed = np.append(list_p_greater_passed, 0) # fill zero
                list_p_greater_failed = np.append(list_p_greater_failed, p_greater) # fill t-value if this channel fails the test
                list_Test = np.append(list_Test, 'fail')  # fill the mark of fail

        return channels, p_greater_dict, list_p_greater, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level




[docs]    def perform_p_belong(self, df_target, p_greater_dict, rv_t_dict, rv_n_dict, confidence_level, channels=0):
        '''
        description:
            1. calculate p_belong for each channel in each frequency in each glitch
            keep only channels whose p_greater is greater than 0.5
        USAGE: list_channels_passed, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level = perform_p_belong(df_target, p_greater_dict, rv_t_dict, rv_n_dict, confidence_level, channels=0)

        :param df_target: target samples in pandas frame
        :param p_greater_dict: null samples in pandas frame
        :param rv_t_dict: a dictionary of beta distribution (scipy obj) for the target samples
        :param rv_n_dict: a dictionary of beta distribution (scipy obj) for the null samples
        :param confidence_level: confidecel level
        :param channels: channels (optional)
        :return:
            list_channels_passed: a list of channels whose p_greater is above 0.5
            list_GPS: a list of GPS time of the target samples
            list_duration: list of durations
            mat_p_belong: a matrix of p_belong where glitch samples are in row and passed channels are in columns
            mat_Test: a matrix of {pass, fail} where
            confidence_level: confidence level used
        '''
        # find a list of channels from the target samples
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        # find the list of GPS times of the target samples
        list_GPS = df_target["GPS"].values
        # get the list of durations of the target samples
        list_duration = df_target["duration"].values
        mat_p_belong = np.array([])
        list_channels_passed = np.array([])
        for channel in channels:
            if p_greater_dict[channel] > 0.5:
                list_channels_passed = np.append(list_channels_passed, channel)
                # calculate p belong
                dbin = 0.00005
                list_p_belong = np.array([])
                T = df_target[channel].values
                for x in T:
                    if x == 0:
                        x += 0.0001
                    if x == 1:
                        x -= 0.0001
                        if T[T != 1].max() < 0.5*x: # if the x = 1 is very edge of the long tail, pro_fore_belong will be the same as pro_back_belong
                            x = T[T != 1].max() + 0.1
                    pro_fore_belong = (rv_t_dict[channel].cdf(x + dbin) - rv_t_dict[channel].cdf(x)) * rv_n_dict[channel].cdf(x)
                    pro_back_belong = (rv_n_dict[channel].cdf(x + dbin) - rv_n_dict[channel].cdf(x)) * rv_t_dict[channel].cdf(x)
                    if pro_fore_belong == pro_back_belong:
                        p_belong = 0.5
                    else:
                        p_belong = pro_fore_belong / float((pro_fore_belong + pro_back_belong))
                    list_p_belong = np.append(list_p_belong, p_belong)
                list_p_belong = list_p_belong.reshape(list_p_belong.shape[0], 1)
                mat_p_belong = np.hstack([mat_p_belong, list_p_belong]) if mat_p_belong.size else list_p_belong
        # find the indices where the faps are less than than 1 - confidence_level which rejects the null hypothesis
        index_passed = np.where(mat_p_belong > confidence_level)
        # make a matrix of "pass" or "fail"
        mat_Test = np.array((["fail"] * mat_p_belong.size)).reshape(mat_p_belong.shape[0], mat_p_belong.shape[-1])
        mat_Test[index_passed] = 'pass'
        return list_channels_passed, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level

[docs]    def ranking_channels(self, list_ranking_statistic, list_Test):
        '''
        description:
            1. sort based on the value of the ranking statistic
            2. sort based on the test with "pass" and "fail", where "pass" comes before "fail"
            3. findthe indecies based on the sort 1) and 2)
        USAGE: list_sorted_base_index_pass_fail, list_sorted_ranking_statistic_pass_fail, list_sorted_Test_pass_fail = ranking_channels(list_ranking_statistic, list_Test)

        :param list_ranking_statistic:
        :param list_Test:
        :return:
        '''
        list_original_indicies = np.arange(0, list_Test.shape[0], 1).astype('int')
        list_sorted_statistic_indices = np.argsort(list_ranking_statistic)[::-1] # descending order
        # sort the original indices
        list_sorted_original_indicies = list_original_indicies[list_sorted_statistic_indices]
        # sort the ranking statistic
        list_sorted_ranking_statistic = list_ranking_statistic[list_sorted_statistic_indices]
        # sort the list of Test
        list_sorted_Test = list_Test[list_sorted_statistic_indices]
        # find the indices where the pass comes first and the fail comes later
        list_pass_indices = np.where(list_sorted_Test == 'pass')[0]
        list_fail_indicies = np.where(list_sorted_Test == 'fail')[0]
        list_indices_sorted_pass_fail = np.append(list_pass_indices, list_fail_indicies)
        # sort the original indices
        list_sorted_base_index_pass_fail = list_sorted_original_indicies[list_indices_sorted_pass_fail]
        # sort the ranking statistic
        list_sorted_ranking_statistic_pass_fail = list_sorted_ranking_statistic[list_indices_sorted_pass_fail]
        # sort the list of Test
        list_sorted_Test_pass_fail = list_sorted_Test[list_indices_sorted_pass_fail]
        # assign rank
        # ...
        return list_sorted_base_index_pass_fail, list_sorted_ranking_statistic_pass_fail, list_sorted_Test_pass_fail





[docs]    def Plot_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
        '''
        description:
            plot the result of one-sided Welch t-test
        USAGE: Plot_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, output_dir, output_file)

        :param channels: a list of channels
        :param list_t_values_passed: a list of t-values that pass the test
        :param list_t_values_failed: a list of t-values that fail the test
        :param list_Test: a list of the test results {'pass', 'fail'}
        :param confidence_level: a confidence level
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels)
        # color the background either with black or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_t_values_passed.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # ++++++++  plot bar plot about t-values that pass or fail Welch t-test ++++++++++++++++
        plt.bar(np.where(list_t_values_failed != 0)[0], list_t_values_failed[list_t_values_failed != 0], 1, align='edge', edgecolor=None, color='lightblue',alpha=0.3, linewidth=0.)
        plt.bar(np.where(list_t_values_passed != 0)[0], list_t_values_passed[list_t_values_passed != 0], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # a list of the probability of the causality
        list_t_values = list_t_values_passed + list_t_values_failed
        # list of the error of the probability of the causality
        # list_causal_err = list_causal_passed_err + list_causal_failed_err
        # sort channel based on the probability of the causality
        if channels[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]','t-value', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 't-value', 'Test{}'.format(confidence_level)))
            # defind the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_t_values.shape[-1] + 1, 1))
        # round importance
        Prodf['t-value'] = Prodf['t-value'].astype(float).round(3)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # ++++++++++++++  alter ticks +++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # set axis label
        plt.ylabel('t-value', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 0.8])
        # save a plot
        output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of t-value VS channel')
        return None



[docs]    def Table_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
        '''
        description:
              save the result of one-sided Welch t-test as a .csv file
        USAGE: Table_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, output_dir, output_file)

        :param channels: a list of channels
        :param list_t_values_passed: a list of t-values that pass the test
        :param list_t_values_failed: a list of t-values that fail the test
        :param list_Test: a list of the test results {'pass', 'fail'}
        :param confidence_level: a confidence level
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        # a list of the t-values
        list_t_values = list_t_values_passed + list_t_values_failed
        if channels[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]','t-value', 'Test{}'.format(confidence_level)))
            # define the table size
        else:
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(list_sorted_base_index_pass_fail, channels[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('label', 'Channel', 't-value', 'Test{}'.format(confidence_level)))
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_t_values.shape[-1] + 1, 1))
        # round importance
        Prodf['t-value'] = Prodf['t-value'].astype(float).round(3)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        output_path = os.path.join(output_dir, output_file)
        # save it
        Prodf.to_csv(output_path, index=False)
        print('Completed saving a table of the t-value of channels')
        return None

    ################

[docs]    def perform_point_chisqr_test(self, df_target, df_null, confidence_level, channels=0):
        '''
        description:
            perform a single point chi-square test for each channel at each glitch
        USAGE: channels, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level= perform_point_chisqr_test(df_target, df_null, confidence_level, channels)

        :param df_target: target samples in pandas frame
        :param df_null: null samples in pandas frame
        :param confidence_level: a confidence level
        :param channels: a list of channels, 0 in default
        :return
            channels: a list of channels in numpy array
            list_GPS: a list of GPS times in numpy array
            list_duration: a list of durations in numpy array
            mat_chsqr_passed: a matrix of "passed" chi-square values where glitch indices are in rows and channels are columns in numpy array
                note that channels in glitches that passed the test have non-zero values, otherwise zero
            mat_chsqr_failed: a matrix of "failed" chi-square values where glitch indices are in rows and channels are columns in numpy array
                note that channels in glitches that failed the test have non-zero values, otherwise zero
            mat_Test: a list of the test results {'pass', 'fail'}
            confidence_level: a value of user defined confidence level that is used for the test
            p_values: a matrix of p-values where glitch indices are in rows and channels are columns in numpy array
        '''
        # find the list of channels
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        # find the list of GPS times of the target samples
        list_GPS = df_target["GPS"].values
        # get the list of durations of the target samples
        list_duration = df_target["duration"].values
        # calculate values of chi-square and corresponding p-values
        chsqr_values, p_values = self.chisquare_test(df_target[channels].values, df_null[channels].values)
        # make a matrix with all the elements equal to zero where its shape is the same as the chisqr_values
        mat_chsqr_passed = np.zeros((chsqr_values.shape[0], chsqr_values.shape[-1]))
        # find the indices where the p_values are less than than 1 - confidence_level which rejects the null hypothesis
        index_passed = np.where(p_values < (1 - confidence_level))
        # replace the values of the elements with chi-square values where the elements rejects the null hypothesis
        # note the elements which failed to reject the null hypothesis is set to be zero
        mat_chsqr_passed[index_passed] = chsqr_values[index_passed]
        # make a matrix of chi-squares that fail to reject the null hypothesis
        # note it elements which reject the null hypothesis set to be zero
        mat_chsqr_failed = chsqr_values - mat_chsqr_passed
        # make a matrix of "pass" or "fail"
        mat_Test = np.array((["fail"] * chsqr_values.size)).reshape(chsqr_values.shape[0], chsqr_values.shape[-1])
        mat_Test[index_passed] = 'pass'
        return channels, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level

[docs]    def Plot_point_chisqur_test(self, channels_band, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
        '''
        description:
            plot the result of point chi-square test
        USAGE: Plot_point_chisqur_test(channels, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

        :param channels_band: a list of channels in numpy array
        :param list_GPS: a list of GPS times in numpy array
        :param list_duration: a list of durations in numpy array
        :param mat_chsqr_passed: a matrix of "passed" chi-square values where glitch indices are in rows and channels are columns in numpy array
                note that channels in glitches that passed the test have non-zero values, otherwise zero
        :param mat_chsqr_failed: a matrix of "failed" chi-square values where glitch indices are in rows and channels are columns in numpy array
                note that channels in glitches that failed the test have non-zero values, otherwise zero
        :param mat_Test: a list of the test results {'pass', 'fail'}
        :param p_values: a matrix of p-values where glitch indices are in rows and channels are columns in numpy array
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        for GPS, duration, list_chsqr_passed, list_chsqr_failed, list_Test in zip(list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test):
            CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
            # color the background either with black or gray for each channel
            Listcolor = ['black', 'gray'] * 20 * 7
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                if edgeRight == list_chsqr_passed.shape[-1] - 1:  # fix the background color to match plot size
                    edgeRight = edgeRight + 1
                ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                ax.set_edgecolor(None)
            # ++++++++  plot bar plot about t-values that pass or fail Welch t-test ++++++++++++++++
            plt.bar(np.where(list_chsqr_failed != 0)[0], list_chsqr_failed[list_chsqr_failed != 0], 1, align='center', edgecolor=None, color='lightblue',alpha=0.3, linewidth=0.)
            plt.bar(np.where(list_chsqr_passed != 0)[0], list_chsqr_passed[list_chsqr_passed != 0], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
            # +++++++++++ plot a table of ranking channels +++++++++++++++++++
            # a list of values of chi-square
            list_chsqr = list_chsqr_passed + list_chsqr_failed
            # sort channel based on the probability of the causality
            if channels_band[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
                # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
                list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
                # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
                # A list of the corresponding frequency bands are created
                # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
                # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
                list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels_band.shape[0], axis=1).flatten(order='F')
                # get rid of the BD# at the end of the channel name to make it original name
                channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_chsqr, list_Test)
                RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'chi-square', 'Test{}'.format(confidence_level)))
                # define the table size
                ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
            else:
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_chsqr, list_Test)
                RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'chi-square', 'Test{}'.format(confidence_level)))
                # defind the table size
                ColWidths = [0.05, 0.6, 0.1, 0.15]
            # add rank label
            Prodf = Prodf.assign(rank=np.arange(1, list_chsqr.shape[-1] + 1, 1))
            # round importance
            Prodf['chi-square'] = Prodf['chi-square'].astype(float).round(3)  # = df.value1.round()
            # re-sort columns to make rank labels coming first
            cols = Prodf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            Prodf = Prodf[cols]
            # take the first 5 high ranking channels
            dcsummary = Prodf.iloc[:5]
            # plot table on the top of the bar plot
            table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
            table.auto_set_font_size(False)
            table.set_fontsize(6)
            # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
            # ++++++++++++++  alter ticks +++++++++++++++++
            # plot xticks using the dominant sub sensor names
            plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
            # put ticks above
            plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
            plt.tick_params(axis="y", labelsize=10)
            plt.grid(False)
            plt.xlim(0, ListSubsys.shape[0])
            # make title
            plt.text(channels.shape[0]*0.5, (list_chsqr_passed + list_chsqr_failed).max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
            plt.ylabel('Chi-square', fontsize=15)
            plt.xlabel('Auxiliary channel', fontsize=15)
            plt.tight_layout(rect=[0, 0.03, 1, 1])
            # save a plot
            if output_file == None:
                output_path = os.path.join(output_dir, '{}_chisqr.pdf'.format(str(GPS).replace('.', '-')))
            else:
                output_path = os.path.join(output_dir, output_file)
            plt.savefig(output_path)
            plt.close()
            print('Completed Saving a plot of chi-square VS channel of a glitch at {}'.format(GPS))
        #------------------+
        # save the table   |
        #------------------+
        # for chi-squares
        mat_chsqr = mat_chsqr_passed + mat_chsqr_failed
        data_GPSduration = np.hstack((list_GPS.reshape(list_GPS.shape[0], 1), list_duration.reshape(list_duration.shape[0], 1)))
        data_chisqr = np.hstack((data_GPSduration, mat_chsqr))
        columns_chisqr = np.append(np.array(["GPS", "duration"]), channels_band)
        df_chisqr = pd.DataFrame(data_chisqr, columns=columns_chisqr)
        df_chisqr.to_csv(os.path.join(output_dir, 'chisqr.csv'))
        print("chi-square table is saved")
        # for p-values
        data_pvalue = np.hstack((data_GPSduration, p_values))
        df_chisqr = pd.DataFrame(data_pvalue, columns=columns_chisqr)
        df_chisqr.to_csv(os.path.join(output_dir, 'p_values.csv'))
        print("p-value table is saved")
        return None


[docs]    def perform_fap(self, df_target, df_null, confidence_level, channels=0):
        '''
        description:
            calculate a FAP for each channel at each glitch
        USAGE: channels, list_GPS, list_duration, mat_fap, mat_Test, confidence_level= perform_fap(df_target, df_null, confidence_level, channels)

        :param df_target: target samples in pandas frame
        :param df_null: null samples in pandas frame
        :param confidence_level: a confidence level
        :param channels: a list of channels, 0 in default
        :return
            channels: a list of channels in numpy array
            list_GPS: a list of GPS times in numpy array
            list_duration: a list of durations in numpy array
            mat_fap: matrix of fap
            mat_Test: a list of the test results {'pass', 'fail'}
            confidence_level: a value of user defined confidence level that is used for the test
        '''
        # find the list of channels
        if type(channels) == int: # if channels are not provided, it uses all the channels in the target samples
            channels = self.find_channels(df_target)
        else:
            pass
        # find the list of GPS times of the target samples
        list_GPS = df_target["GPS"].values
        # get the list of durations of the target samples
        list_duration = df_target["duration"].values
        # calculate values of FAP
        mat_fap = self.calculate_fap(df_target[channels].values, df_null[channels].values)
        # find the indices where the faps are less than than 1 - confidence_level which rejects the null hypothesis
        index_passed = np.where(mat_fap < (1 - confidence_level))
        # make a matrix of "pass" or "fail"
        mat_Test = np.array((["fail"] * mat_fap.size)).reshape(mat_fap.shape[0], mat_fap.shape[-1])
        mat_Test[index_passed] = 'pass'
        return channels, list_GPS, list_duration, mat_fap, mat_Test, confidence_level



[docs]    def Plot_fap(self, channels_band, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
        '''
        description:
            plot the result of point chi-square test
        USAGE: Plot_point_chisqur_test(channels, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

        :param channels_band: a list of channels in numpy array
        :param list_GPS: a list of GPS times in numpy array
        :param list_duration: a list of durations in numpy array
        :param mat_fap: matrix of fap
        :param mat_Test: a list of the test results {'pass', 'fail'}
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        for GPS, duration, list_fap, list_Test in zip(list_GPS, list_duration, mat_fap, mat_Test):
            CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
            # color the background either with black or gray for each channel
            Listcolor = ['black', 'gray'] * 20 * 7
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                if edgeRight == list_fap.shape[-1] - 1:  # fix the background color to match plot size
                    edgeRight = edgeRight + 1
                ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                ax.set_edgecolor(None)
            # ++++++++  plot bar plot about - log10(fap) ++++++++++++++++
            list_fap_for_plot = list_fap
            # for the plottting perpose, 1 / fap when fap = 0 will be infinit so that I replace the fap = 0 to be the maximum fap in the mat_fap
            list_fap_for_plot[list_fap_for_plot == 0] = mat_fap[mat_fap != 0].min()
            plt.bar(np.where(list_fap_for_plot >= (1 - confidence_level))[0], 1 / list_fap_for_plot[list_fap_for_plot >= (1 - confidence_level)], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
            plt.bar(np.where(list_fap_for_plot < (1 - confidence_level))[0], 1 / list_fap_for_plot[list_fap_for_plot < (1 - confidence_level)], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
            # +++++++++++ plot a table of ranking channels +++++++++++++++++++
            # # a list of values of fap
            # list_fap = faps
            # sort channel based on the probability of the causality
            if channels_band[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
                # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
                list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
                # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
                # A list of the corresponding frequency bands are created
                # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
                # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
                list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels_band.shape[0], axis=1).flatten(order='F')
                # get rid of the BD# at the end of the channel name to make it original name
                channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(-np.log10(list_fap), list_Test)
                RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_fap[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'FAP', 'Test{}'.format(confidence_level)))
                # define the table size
                ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
            else:
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(-np.log10(list_fap), list_Test)
                RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_fap[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'FAP', 'Test{}'.format(confidence_level)))
                # defind the table size
                ColWidths = [0.05, 0.6, 0.1, 0.15]
            # add rank label
            Prodf = Prodf.assign(rank=np.arange(1, list_fap.shape[-1] + 1, 1))
            # round importance
            Prodf['FAP'] = ['{:.2e}'.format(fap) for fap in Prodf['FAP'].astype(float)]
            # re-sort columns to make rank labels coming first
            cols = Prodf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            Prodf = Prodf[cols]
            # take the first 5 high ranking channels
            dcsummary = Prodf.iloc[:5]
            # plot table on the top of the bar plot
            table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
            table.auto_set_font_size(False)
            table.set_fontsize(6)
            # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
            # ++++++++++++++  alter ticks +++++++++++++++++
            # plot xticks using the dominant sub sensor names
            plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
            # put ticks above
            plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
            plt.tick_params(axis="y", labelsize=10)
            plt.grid(False)
            plt.xlim(0, ListSubsys.shape[0])
            # make title
            plt.text(channels.shape[0]*0.5, (1/list_fap_for_plot).max()**1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
            plt.ylim(ymin=1)
            plt.ylabel('1/FAP', fontsize=15)
            plt.xlabel('Auxiliary channel', fontsize=15)
            plt.yscale('log')
            #plt.tight_layout(rect=[0, 0.03, 1, 0.8])
            plt.tight_layout(rect=[0, 0.03, 1, 1])
            # save a plot
            if output_file == None:
                output_path = os.path.join(output_dir, '{}_fap.pdf'.format(str(GPS).replace('.', '-')))
            else:
                output_path = os.path.join(output_dir, output_file)
            plt.savefig(output_path)
            plt.close()
            print('Completed Saving a plot of FAP VS channel of a glitch at {}'.format(GPS))
        return None

[docs]    def Save_fap_csv(self, channels, list_GPS, list_duration, mat_fap, output_dir):
        '''
        description:
            save the FAP table as csv file
        USAGE: Save_fap_csv(channels, list_GPS, list_duration, mat_fap, output_dir)

        :param channels: a list of channels
        :param list_GPS: a list of GPS times
        :param list_duration: a list of durations
        :param mat_fap: a matrix of FAP of each channel for each glitch
        :param output_dir: output directory
        :return: None
        '''
        #------------------+
        # save the table   |
        #------------------+
        # for fap
        data_GPSduration = np.hstack((list_GPS.reshape(list_GPS.shape[0], 1), list_duration.reshape(list_duration.shape[0], 1)))
        data_fap = np.hstack((data_GPSduration, mat_fap))
        columns_fap = np.append(np.array(["GPS", "duration"]), channels)
        df_fap = pd.DataFrame(data_fap, columns=columns_fap)
        df_fap.to_csv(os.path.join(output_dir, 'fap.csv'))
        print("FAP table is saved")
        return None

[docs]    def Plot_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
        '''
        description:
            plot the result of p_greater
        USAGE: Plot_p_greater(channels, list_p_greater_passed, list_p_greater_failed, list_Test, output_dir, output_file)

        :param channels: a list of channels
        :param list_p_greater_passed: a list of p_greater above confidence level
        :param list_p_greater_failed: a list of p_greater that fail the test
        :param list_Test: a list of the test results {'pass', 'fail'}
        :param confidence_level: a confidence level
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels)
        # color the background either with black or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_p_greater_passed.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # ++++++++  plot bar plot about t-values that pass or fail Welch t-test ++++++++++++++++
        plt.bar(np.where(list_p_greater_failed != 0)[0], list_p_greater_failed[list_p_greater_failed != 0], 1, align='edge', edgecolor=None, color='lightblue',alpha=0.3, linewidth=0.)
        plt.bar(np.where(list_p_greater_passed != 0)[0], list_p_greater_passed[list_p_greater_passed != 0], 1, align='edge', edgecolor=None, color='red', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # a list of the probability of the causality
        list_t_values = list_p_greater_passed + list_p_greater_failed
        # list of the error of the probability of the causality
        # list_causal_err = list_causal_passed_err + list_causal_failed_err
        # sort channel based on the probability of the causality
        # name of the probablity
        name_p_greater = r'$p_g$'
        if channels[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', name_p_greater, 'Threshold {}'.format(confidence_level)))
            # define the table size
            ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', name_p_greater, 'Threshold {}'.format(confidence_level)))
            # defind the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_t_values.shape[-1] + 1, 1))
        # round importance
        Prodf[name_p_greater] = Prodf[name_p_greater].astype(float).round(5)  # = df.value1.round()
        #Prodf['p_greater'] = Prodf['p_greater'].astype(float).round(5)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # ++++++++++++++  alter ticks +++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # set axis label
        #plt.ylabel(r'p_greater', fontsize=15)
        plt.ylabel(r'$p_g$', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 1])
        # save a plot
        output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of p_greater VS channel')
        return None


[docs]    def Table_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
        '''
        description:
              save the result of p_greater as a .csv file
        USAGE: Table_Welch_t_test(channels, list_p_greater_passed, list_p_greater_failed, list_Test, output_dir, output_file)

        :param channels: a list of channels
        :param list_p_greater_passed: a list of p_greater that pass the test
        :param list_p_greater_failed: a list of p_greater that fail the test
        :param list_Test: a list of the test results {'pass', 'fail'}
        :param confidence_level: a confidence level
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        # a list of the t-values
        list_t_values = list_p_greater_passed + list_p_greater_failed
        if channels[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
            # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
            # A list of the corresponding frequency bands are created
            # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
            # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
            list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1).flatten(order='F')
            # get rid of the BD# at the end of the channel name to make it original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels])
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'p_greater', 'Test{}'.format(confidence_level)))
            # define the table size
        else:
            # sort the links based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
            RankChannel = list(zip(list_sorted_base_index_pass_fail, channels[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('label', 'Channel', 'p_greater', 'Test{}'.format(confidence_level)))
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_t_values.shape[-1] + 1, 1))
        # round importance
        Prodf['p_greater'] = Prodf['p_greater'].astype(float).round(6)  # = df.value1.round()
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        output_path = os.path.join(output_dir, output_file)
        # save it
        Prodf.to_csv(output_path, index=False)
        print('Completed saving a table of the p_greater of channels')
        return None

[docs]    def Save_p_belong_csv(self, channels_passed, list_GPS, list_duration, mat_p_belong, output_dir):
        '''
        description:
            save the p_belong table as csv file
            channels_passed is the list of channels whose p_greater is above 0.5 to avoid misinterpretation of p_belong.
        USAGE: Save_p_belong_csv(channels_passed, list_GPS, list_duration, mat_p_belong, output_dir)

        :param channels_passed: a list of channels
        :param list_GPS: a list of GPS times
        :param list_duration: a list of durations
        :param mat_p_belong: a matrix of p_belong of each channel for each glitch
        :param output_dir: output directory
        :return: None
        '''
        #------------------+
        # save the table   |
        #------------------+
        # for fap
        data_GPSduration = np.hstack((list_GPS.reshape(list_GPS.shape[0], 1), list_duration.reshape(list_duration.shape[0], 1)))
        data_p_belong = np.hstack((data_GPSduration, mat_p_belong))
        columns_p_belong = np.append(np.array(["GPS", "duration"]), channels_passed)
        df_p_belong = pd.DataFrame(data_p_belong, columns=columns_p_belong)
        df_p_belong.to_csv(os.path.join(output_dir, 'p_belong.csv'))
        print("p_belong table is saved")
        return None

[docs]    def Plot_p_belong(self, channels_band, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
        '''
        description:
            plot the result of p_belong
        USAGE: Plot_p_belong(channels, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

        :param channels: a list of channels in numpy array
        :param list_GPS: a list of GPS times in numpy array
        :param list_duration: a list of durations in numpy array
        :param mat_p_belong: matrix of p_belong
        :param mat_Test: a list of the test results {'pass', 'fail'}
        :param output_dir: an output directory
        :param output_file: an output file name
        :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
        :return: None
        '''
        for GPS, duration, list_p_belong, list_Test in zip(list_GPS, list_duration, mat_p_belong, mat_Test):
            CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
            # color the background either with black or gray for each channel
            Listcolor = ['black', 'gray'] * 20 * 7
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                if edgeRight == list_p_belong.shape[-1] - 1:  # fix the background color to match plot size
                    edgeRight = edgeRight + 1
                ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                ax.set_edgecolor(None)
            # for the plottting perpose, 1 / fap when fap = 0 will be infinit so that I replace the fap = 0 to be the maximum fap in the mat_fap
            plt.bar(np.where(list_p_belong <= confidence_level)[0], list_p_belong[list_p_belong <= confidence_level], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
            plt.bar(np.where(list_p_belong > confidence_level)[0], list_p_belong[list_p_belong > confidence_level], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
            # +++++++++++ plot a table of ranking channels +++++++++++++++++++
            # # a list of values of fap
            # list_fap = faps
            # sort channel based on the probability of the causality
            if channels_band[0][-3:-1] == "BP": # if the multi-frequency band search is chosen, the channel name is modified such that is has its name + BP#
                # make the lisk of frequency band, if no-frequency bnad limit is not specified (None-None), it is replaced with "All"
                list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
                # As a list of the modified channel name is structure in Channel A in band 0, Channel A in band 1, .., Channel B in band 0, ...
                # A list of the corresponding frequency bands are created
                # Note that np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, band 0, band 0, ..., band 0], [band 1, band 1, band 1, ...,, ], ..., [band 7, band 7], ...]
                # Note flatten(order='F') flattened the array such that [band 0 , band 1, ..., band 7, band 0, band 1, ..., band 7], ...
                #list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels_band.shape[0], axis=1).flatten(order='F')
                # Because for plotting p_belong, only channels that passed p_greater are saved in the matrix
                list_freq_bands = np.array([list_freq_bands[int(ch.split('BP')[-1])] for ch in channels_band])
                # get rid of the BD# at the end of the channel name to make it original name
                channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_p_belong, list_Test)
                RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'p_belong', 'Test{}'.format(confidence_level)))
                # define the table size
                ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
            else:
                # sort the links based on values of statistic
                list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_p_belong, list_Test)
                RankChannel = list(zip(channels_band[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
                # make pandas matrix
                Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'p_belong', 'Test{}'.format(confidence_level)))
                # defind the table size
                ColWidths = [0.05, 0.6, 0.1, 0.15]
            # add rank label
            Prodf = Prodf.assign(rank=np.arange(1, list_p_belong.shape[-1] + 1, 1))
            # round importance
            Prodf['p_belong'] = ['{:.4f}'.format(fap) for fap in Prodf['p_belong'].astype(float)]
            # re-sort columns to make rank labels coming first
            cols = Prodf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            Prodf = Prodf[cols]
            # take the first 5 high ranking channels
            dcsummary = Prodf.iloc[:5]
            # plot table on the top of the bar plot
            table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
            table.auto_set_font_size(False)
            table.set_fontsize(6)
            # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
            # ++++++++++++++  alter ticks +++++++++++++++++
            # plot xticks using the dominant sub sensor names
            plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
            # put ticks above
            plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
            plt.tick_params(axis="y", labelsize=10)
            plt.grid(False)
            plt.xlim(0, ListSubsys.shape[0])
            # make title
            plt.text(channels.shape[0]*0.5, list_p_belong.max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
            plt.ylabel('p_belong', fontsize=15)
            plt.xlabel('Auxiliary channel', fontsize=15)
            #plt.tight_layout(rect=[0, 0.03, 1, 0.8])
            plt.tight_layout(rect=[0, 0.03, 1, 1])
            # save a plot
            if output_file == None:
                output_path = os.path.join(output_dir, '{}_p_belong.pdf'.format(str(GPS).replace('.', '-')))
            else:
                output_path = os.path.join(output_dir, output_file)
            plt.savefig(output_path)
            plt.close()
            print('Completed Saving a plot of p_belong VS channel of a glitch at {}'.format(GPS))
        return None



    ###################


    # def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence):
    #     '''
    #     description:
    #         1. query the clustered target samples
    #         2. query null samples
    #         3. perform one-sided binomial test and one-sided Welch t-test on each subclass
    #         4. count the number of subclasses which has at least one channel passing the both tests
    #     USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence)
    #
    #     :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples
    #     :param Path_Null_Dataset: a path to null samples
    #     :param test_confidence: a statistical confidence level
    #     :return: num_subclass: the number of subclasses which has at least one channel passing the both tests
    #     '''
    #     num_subclass = 0 # set it zero
    #     # query the clustered target samples and null samples
    #     df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset)
    #     # get a list of subclass labels (non-duplicated)
    #     SubClass_labels = df_target.SubclassLabel.unique().astype('int')
    #     for Sub_class_Label in SubClass_labels:  # iterate though subclasses
    #         df_target_sub = df_target[df_target.SubclassLabel == Sub_class_Label].drop(['SubclassLabel'], axis=1)
    #         # binomial test
    #         list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_binomi_Test, list_channels = self.find_meaing_ful_confidence(df_target_sub, df_null, test_confidence)
    #         # Welch t-test
    #         channels, list_t_values_passed, list_t_values_failed, list_t_Test, confidence_level_used = self.perform_Welch_test(df_target_sub, df_null, test_confidence)
    #         if list_binomi_Test[list_binomi_Test == 'pass'].shape[0] > 0 and list_t_Test[list_t_Test == 'pass'].shape[0] > 0: # if there is a least one channel that pass the both tests
    #             num_subclass += 1 # count 1
    #     return num_subclass


[docs]    def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence):
        '''
        description:
            1. query the clustered target samples
            2. query null samples
            3. perform one-sided binomial test and one-sided Welch t-test on each subclass
            4. count the number of subclasses which has at least one channel passing the both tests
        USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence)

        :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples
        :param Path_Null_Dataset: a path to null samples
        :param test_confidence: a statistical confidence level
        :return: num_subclass: the number of subclasses which has at least one channel passing the both tests
        '''
        num_subclass = 0 # set it zero
        # query the clustered target samples and null samples
        df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset)
        # estimate beta distributions
        rv_t_dict, rv_n_dict = self.perform_beta_dist(df_target, df_null)
        # get a list of subclass labels (non-duplicated)
        SubClass_labels = df_target.SubclassLabel.unique().astype('int')
        for Sub_class_Label in SubClass_labels:  # iterate though subclasses
            df_target_sub = df_target[df_target.SubclassLabel == Sub_class_Label].drop(['SubclassLabel'], axis=1)
            channels, p_greater_dict, list_p_greater, list_p_greater_passed, list_p_greater_failed, list_p_greater_Test, confidence_level_used = self.perform_p_greater(df_target_sub, rv_t_dict, rv_n_dict, test_confidence)
            if list_p_greater_Test[list_p_greater_Test == 'pass'].shape[0] > 0: # if there is a least one channel that pass the both tests
                num_subclass += 1 # count 1
            # binomial test
            # list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_binomi_Test, list_channels = self.find_meaing_ful_confidence(df_target_sub, df_null, test_confidence)
            # Welch t-test
            # channels, list_t_values_passed, list_t_values_failed, list_t_Test, confidence_level_used = self.perform_Welch_test(df_target_sub, df_null, test_confidence)

            #if list_binomi_Test[list_binomi_Test == 'pass'].shape[0] > 0 and list_t_Test[list_t_Test == 'pass'].shape[0] > 0: # if there is a least one channel that pass the both tests
            #    num_subclass += 1 # count 1
        return num_subclass


[docs]    def Get_MatCountGPSDuration(self, Path_Target_Glitch_SubClassClustered_Dataset):
        '''
        description:
            get a importance matrix, a list of channels, a list of GPS times, and a list of durations from the clustered target samples

        :param Path_Target_Glitch_SubClassClustered_Dataset:  a path to the clustered target samples
        :return:
            MatCount: a importance matrix
            ListChannelName: a list of channels
            ListGPS: a list of GPS times
            ListDuration: a list of durations
        '''
        df = pd.read_csv(Path_Target_Glitch_SubClassClustered_Dataset)
        try:
            ListChannelName = np.array(df.drop(['Unnamed: 0', 'SubclassLabel', 'GPS', 'duration'], axis=1).columns.tolist())
        except ValueError:
            ListChannelName = np.array(df.drop(['Unnamed: 0', 'SubclassLabel', 'GPS'], axis=1).columns.tolist())

        ListGPS = df['GPS'].values
        ListDuration = df['duration'].values
        MatCount = df.drop(['Unnamed: 0', 'SubclassLabel', 'GPS', 'duration'], axis=1).values
        return MatCount, ListChannelName, ListGPS, ListDuration



[docs]    def Principal_component_analysis(self, MatCount, frac_component=0.9):
        '''
        description:
            PCA decomposition applided to feature matrix
        USAGE: MatCount, MatCount_inverse_pca = Principal_component_analysis(self, MatCount, frac_component=0.9)

        :param MatCount: a feature matrix of glitches with samples in rows and features in columns
        :param frac_component: a cumulative variance
        :return:
            MatCount_pca: a feature matix in PCA space
            MatCount_inverse_pca: a reconstructed feature matrix in the original space
        '''
        pca = PCA()
        pca.fit(MatCount)
        cum_variance = np.cumsum(pca.explained_variance_ratio_)
        value_cut = cum_variance[cum_variance > frac_component][0]
        index_cut = np.where(cum_variance== value_cut)[0][0]

        pca = PCA(n_components=index_cut)
        MatCount_pca = pca.fit_transform(MatCount)
        MatCount_inverse_pca = pca.inverse_transform(MatCount_pca[:, :index_cut])

        # plt.imshow(MatCount, aspect='auto')
        # plt.tight_layout()
        # plt.savefig('orginal.png')
        # plt.close()
        #
        # plt.imshow(MatCount_pca, aspect='auto')
        # plt.savefig('pca.png')
        # plt.close()
        #
        # plt.imshow(MatCount_inverse_pca, aspect='auto',)
        # plt.savefig('inverse_pca.png')
        # plt.close()
        return MatCount_pca, MatCount_inverse_pca


[docs]    def FindSubClass(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, upper_number_cluster, applied_Transformation):
        '''
        description:
            1. use a clustering approach
            2. plot glitch index VS channel grouped by clusters
            3. make a table comprising a list of GPS times a given cluster
            4. plot Importance VS channel of a given sub-class
            5. make a corresponding table
        USAGE: FindSubClass(MatCount, ListChannelName, ListGPS, output_dir)

        :param MatCount: a matrix comprising importance versus channels
        :param ListChannelName:  a list of channel names
        :param ListGPS: a list of GPS times
        :param ListDuration: a list of durations
        :param output_dir:
        :param upper_number_cluster: a value of upper limit of the number of clusters
        :param applied_Transformation: decomposition applided {'PCA', 'kernelPCA', 'None'}
        :return: None
        '''


        if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful
            print('There is only one single glitch. Skip finding sub-classes')
            return None
        # limit the upper_number_cluster as it should be below a number of the sample
        if MatCount.shape[0] < upper_number_cluster:
            upper_number_cluster = MatCount.shape[0]
        print('Finding sub-classes by Gaussian mixture clustering')
        #-------------------------------------------------+
        # use Gaussian mixture model to get sub-classes   |
        #-------------------------------------------------+
        # PCA or kernel PCA applied or not
        if applied_Transformation == 'PCA':
            MatCount_pca, MatCount_inverse_pca = self.Principal_component_analysis(MatCount)
            X = MatCount_pca
        elif applied_Transformation == 'None':
            X = MatCount
        lowest_bic = np.infty
        bic = []
        n_components_range = range(1, upper_number_cluster+1)
        cv_types = ['spherical', 'tied', 'diag', 'full']
        for cv_type in cv_types:
            for n_components in n_components_range:
                # Fit a Gaussian mixture with EM
                gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type)
                gmm.fit(X)
                bic.append(gmm.bic(X))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
        clf = best_gmm
        Y_ = clf.predict(X) # label of the clusters
        ListUniqueLable = np.unique(Y_) # unique labels
        DictMatCount = {} # a dictionary having matrices for a given cluster label
        for UniqueLabel in ListUniqueLable:
            indecies = np.where(Y_ == UniqueLabel)[0]
            ClusteredMatCount = MatCount[indecies]
            DictMatCount[str(UniqueLabel)] = ClusteredMatCount
        print('{} sub-classes have been found'.format(len(DictMatCount)))
        #----------------------------------------------------------+
        # plot of occurrence VS channel grouped by sub-classes     |
        #----------------------------------------------------------+
        # color the background either with blakc or gray for each channel
        Listcolor = ['black', 'gray'] * 20 * 7
        # get xticks for dominant channels
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
        fig = plt.figure()
        # total grid of sub-plots
        firstratio = int(0.1 * MatCount.shape[0])
        # if the number of sample is less than 10, firstratio becomes 0. To cope with this issue, I manually replace it with 1
        if firstratio == 0:
            firstratio = 1
        totalrow = MatCount.shape[0] + firstratio
        # make grid for subplots
        gs = GridSpec(totalrow, 20)
        # a subplot showing channel position
        ax0 = fig.add_subplot(gs[:firstratio, :-1])  # First row, first column
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == MatCount.shape[-1] - 1: # adjust the rightest color span
                edgeRight = edgeRight + 1
            ax0.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        # limit the xaxis of the first subplot
        ax0.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001) - 1 + 0.5)
        # get rid of ticks of the first plot
        ax0.yaxis.set_major_locator(plt.NullLocator())
        ax0.xaxis.set_major_locator(plt.NullLocator())
        ax0.set_xticks(CenterTicks)
        ax0.set_xticklabels(ListSubsys[ListInd.astype(int)])
        ax0.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-20, length=0, labelrotation=90, size=0, labelsize=10)
        ax0.grid(False)
        StartRow = firstratio # first row number of a sub-plot
        SubClassInd = 0
        for SubClassLabel, SubMatCount in sorted(DictMatCount.items()): # iterate through sub-classes
            NumRow = SubMatCount.shape[0] # the number of row for this sub-class
            EndRow = StartRow + NumRow # the ending row number
            # assign a size for the second subplot
            ax = fig.add_subplot(gs[StartRow:EndRow, :-1])
            ax.imshow(SubMatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper', vmin=MatCount.min(), vmax=MatCount.max())
            ax.yaxis.set_label_position("right")
            # adjust y-ticks size
            ax.tick_params(axis="y", labelsize=10)
            ax.axis(xmin=-0.5, xmax=round(SubMatCount.shape[-1] * 1.001)-1+0.5)
            #ax.set_ylabel('sub{}'.format(SubClassLabel), fontsize=10, rotation=0)
            ax.text(SubMatCount.shape[-1]*1.001, SubMatCount.shape[0]/2., 'sub{}'.format(SubClassLabel), ha='left', va='bottom', rotation=90., fontsize=10)
            if SubMatCount.shape[0] == 1:
                ax.set_yticks([0])
            else:
                ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            ax.set_xticks([])  # delete ticks
            # get rig of grid
            ax.grid(False)
            SubClassInd += 1 # update for subclass index
            StartRow = EndRow # update for a next one

        fig.subplots_adjust(hspace=int(0.005*totalrow))
        fig.subplots_adjust(wspace=int(3))
        ax.set_xlabel('Auxiliary channel', fontsize=15)
        fig.text(0.05, 0.5, 'Glitch index', ha='center', va='center', rotation=90., fontsize=15)
        ###### Make a color bar
        cmap = matplotlib.cm.viridis
        norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
        ax3 = fig.add_subplot(gs[1:, -1])
        cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
        ax3.grid(False)
        cbar.set_label('Importance', size=10)
        ########
        output_path = os.path.join(output_dir, 'GlitchIndexVSChannelClustered.pdf')
        fig.savefig(output_path)
        plt.close()
        #-------------------------------------------------------------------------------+
        # make .csv file comprising SubClassLabel, GPS, importance of all the channels  |
        #-------------------------------------------------------------------------------+
        # matrix of SubClass Label and corresponding GPS times
        MatSubClassLabelGPSDuration = np.concatenate((Y_.reshape(Y_.shape[0], 1), ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1)), axis=1)
        # concatenate subclass labels and a list of GPS times and importance of all the channels
        MatSubClassLabelGPSDurationImportanceChannels = np.concatenate((MatSubClassLabelGPSDuration, MatCount), axis=1)
        # make a list of column names
        columnName = np.append(np.array(['SubclassLabel', 'GPS', 'duration']), ListChannelName)
        # convert it to a pandas frame
        dfGPSDurationImportanceChannels = pd.DataFrame(MatSubClassLabelGPSDurationImportanceChannels, columns=columnName)
        path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'ClusteredGPSDurationImportanceChannels.csv')
        # save it as .csv file
        dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
        print('Completed saving a table comprising a sub-class labels, GPS times, and importance of the channels')
        #--------------------------------------------+
        # plot each sub-class and make a table it    |
        #--------------------------------------------+
        for SubClassLabel, SubMatCount in sorted(DictMatCount.items()):  # iterate through sub-classes
            Plotoutput_file = 'SubClass{}.pdf'.format(SubClassLabel)
            Tableoutput_file = 'SubClass{}.csv'.format(SubClassLabel)
            # plot
            self.PlotImportanceVSChannel(SubMatCount, ListChannelName, output_dir, Plotoutput_file)
            # making a table
            self.TableImportance(SubMatCount, ListChannelName, output_dir, Tableoutput_file)
            print('Completed plotting and making a table of sub-class {}'.format(SubClassLabel))
        print('Sub-class analysis is done')
        return None

[docs]    def AutoDetermineTrendBin(self, SegmentStart, SegmentEnd):
        '''
        description: automatically determine the bins of the subclass trend plot
        USAGE: trend = AutoDetermineTrendBin(SegmentStart, SegmentEnd)

        :param SegmentStart: start time of a segment
        :param SegmentEnd: end time of a segment
        :return: trend:
            {'mins', 'hours', 'days', 'month'}
        '''
        length = SegmentEnd - SegmentStart # segment length in second
        if length < 3 * 60 * 60: # if the length is shorter than 3 hours
            trend = 'mins'
        elif length >=  3 * 60 * 60 and length < 3 * 24 * 60 * 60: # if the length is longer than 3 hours and shorter than 3 days
            trend = 'hours'
        elif length >= 3 * 24 * 60 * 60 and length < 3 * 30 * 24 * 60 * 60: # if the length longer than 3 days and shorter than 3 months
            trend = 'days'
        elif length >= 3 * 30 * 24 * 60 * 60: # if the length is longer than 3 months
            trend = 'months'
        return trend

[docs]    def TrendSubClass(self, SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='months', norm=False):
        '''
        input file is supposed to be ClusteredGPSImportanceChannels.csv
        USAGE: TrendSubClass(SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='month')

        :param SegmentStart: start GPS time of a segment
        :param SegmentEnd: end GPS time of a segment
        :param input_dir: input directory
        :param input_file: an input file
        :param output_dir: an output directory
        :param trend: a trend 'months', 'days', 'hours' or 'mins'
        :return: None
        '''
        # load an input file
        input_path = os.path.join(input_dir, input_file)
        try:
            Subclass_df = pd.read_csv(input_path)
        except IOError:
            print('There is no sub-class file. Skip the trend plot')
            return None
        # set a trend, month or day
        if trend == 'months':
            TrendInSecond = float(30 * 24 * 60 * 60)  # in second
        elif trend == 'days':
            TrendInSecond = float(24 * 60 * 60)  # in second
        elif trend == 'hours':
            TrendInSecond = float(60 * 60)  # in second
        elif trend == 'mins':
            TrendInSecond = float(60) # in second
        # add a column having time from the beginning of a segment in a trend scale
        Subclass_df = Subclass_df.assign(time=(Subclass_df['GPS'] - SegmentStart) / TrendInSecond)  # convert GPS time to time in a trend
        # a length of this segment in a trend scale
        SegmentLength = (SegmentEnd - SegmentStart) / TrendInSecond
        # bins of histogram
        Bins = int(round(SegmentLength))
        # non-duplicate sub-class labels
        UniqueClass = np.unique(Subclass_df['SubclassLabel']).astype(int)
        # make a list of each sub-class trend
        SubClassSeries = [Subclass_df[Subclass_df['SubclassLabel'] == subclass]['time'] for subclass in UniqueClass]
        LABEL = ['sub{}'.format(i) for i in UniqueClass]
        # making combinations of colors and linestyles
        ListColor = ['blue', 'red', 'cyan', 'gray', 'green', 'yellow']
        #ListLineStyle = ['-', '--', ':', '-.']
        ListLineStyle = ['-']
        ColorStyleCombination = list(itrtls.product(ListColor, ListLineStyle))
        rd.shuffle(ColorStyleCombination)
        # make a histogram
        if norm == False:
            for index in range(len(LABEL)):
                Count, edge = np.histogram(SubClassSeries[index], range=(0, SegmentLength), bins=Bins, density=False)  # without normalization
                d_edge = edge[1] - edge[0]
                Rate = Count/float(d_edge)
                #plt.plot(edge[:-1], Rate, drawstyle='steps-mid', color=ColorStyleCombination[index][0], ls=ColorStyleCombination[index][1], label=LABEL[index])
                plt.plot(edge[:-1], Rate, marker='o', color=ColorStyleCombination[index][0], ls=ColorStyleCombination[index][1], alpha=0.5, label=LABEL[index])
            plt.ylabel('Rate [/{}]'.format(trend.split('s')[0]), fontsize=15)
        if norm == True:
            for index in range(len(LABEL)):
                Count, edge = np.histogram(SubClassSeries[index], range=(0, SegmentLength), bins=Bins, density=True)  # without normalization
                d_edge = edge[1] - edge[0]
                Rate = Count/float(d_edge)
                #plt.plot(edge[:-1], Rate, drawstyle='steps', color=ColorStyleCombination[index][0], ls=ColorStyleCombination[index][1], label=LABEL[index])
                plt.plot(edge[:-1], Rate, marker='o', color=ColorStyleCombination[index][0], ls=ColorStyleCombination[index][1], alpha=0.5, label=LABEL[index])
            plt.ylabel('Normalized rate [/{}]'.format(trend), fontsize=15)
        TIME = tconvert(SegmentStart)
        plt.xlabel('Time [{0}] from {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(trend, TIME.year, TIME.month, TIME.day, TIME.hour, TIME.minute, TIME.second), fontsize=15)
        plt.tick_params(axis="x", labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.legend()
        if trend == 'months':
            output_file = 'SubClassTrend_Monthly.pdf'
        elif trend == 'days':
            output_file = 'SubClassTrend_Dayly.pdf'
        elif trend == 'hours':
            output_file = 'SubClassTrend_Hourly.pdf'
        elif trend == 'mins':
            output_file = 'SubClassTrend_minutes.pdf'
        output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        return None



[docs]    def make_subset_channel_based_on_samplingrate(self, g, target_sampling_rate):
        '''
        dependencies being upon: PlotIndividualFCS_ImportanceVSChannel()
        Usage: X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = make_subset_channel_based_on_samplingrate(f['gps00000'], 256)

        :param g: target glitch class's group or a file itself (at a GPS time), HDF5 format
        :param target_sampling_rate: Sampling rate used to group them, (256,    512,   1024,   2048,   4096,   8192,  16384
        :return X: matrix comprising some channels with a same samplign rate of
                a target glitch class at a given time whitened a reference, only
                duration: duration of time series
                Listch_label_num: the list of channel labels
                ListChannel: the list of name of channels
                GPS: GPS time of this group
                SNR: SNR of this glitch
                confidence: confidence level of this glitch
                ID: GravitySpy uniqu ID
        '''
        GPS = g.attrs['GPS']  # GPS time
        try:
            SNR = g.attrs['snr']  # SNR
            confidence = g.attrs['confidence']  # confidence level
            try:
                ID = g.attrs['id'].decode('utf-8')  # GravitySpy id
            except AttributeError:
                ID = None
        except KeyError:
            SNR = 0
            confidence = 0
            ID = 'None'

        Listch_label_num = []  # a list having the channel labels
        ListChannel = []  # a list having the name of channels
        X = np.array([])
        typeSeg = 'T'
        for ch_label in g.keys():
            if str(ch_label)[2] == typeSeg:
                dset = g[ch_label]
                sr = dset.attrs['SamplingRate']
                duration = dset.attrs['duration']
                channel = dset.attrs['channel'].decode('utf-8')
                ##########  remove noisemon channels #############
                try:
                    if channel.split('-')[1].split('_')[2] == 'NOISEMON':
                        continue
                except IndexError:
                    pass
                ######################
                if sr == target_sampling_rate:
                    spT = dset.value # whitened FFT values
                    if np.isnan(spT[0]) == True:
                        spT = np.zeros(spT.shape[0])
                    pT = spT#**2 # whitened (Amplitude) Power series
                    pT = pT.reshape((1, pT.shape[0])) # reshape the vector suitable to make a matrix
                    # print(white_scale_psd.shape)
                    X = np.vstack([X, pT]) if X.size else pT
                    Listch_label_num.append(int(ch_label.split('ch{}'.format(typeSeg))[1]))
                    ListChannel.append(channel)
                else:
                    continue
        return X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID



[docs]    def PlotIndividualFCS_ImportanceVSChannel(self, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None', mode='offline', Listsegments=None, re_sfchs=None, Data_outputpath=None, Data_outputfilename=None, PlusHOFT='False', number_process=None):
        '''
        description:
            1. load a file comprising all glitches in a class
            2. create a plot comprising frequency versus channel & importance versus channel
                - dependency:
                            self.CreateChannelTicks(ListChannel)
                            self.make_subset_channel_based_on_samplingrate()
                            self.CreateMatCount()
                            self.PlotImportanceVSChannel()
            3. save a plot
        dependencies: make_subset_channel_based_on_samplingrate(), CreateChannelTicks(), CreateMatCount(), PlotImportanceVSChannel()
        USAGE: PlotIndividualFCS_ImportanceVSChannel(glitchtype, IFO, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq)

        :param glitchtype: a type of glitch used for create a name of a plot
        :param IFO: # a type of IFO used in a name of a plot
        :param GravitySpy_df: a meta data of Gravity Spy in pandas frame
        :param output_dir: a output directory
        :param simga: an integer number used for the upper bound of BG noise
        :param LowerCutOffFreq: the lower cut-off frequency used in CreateMatCount, 'None' in default
        :param UpperCutOffFreq: the upper cut-off frequency used in CreateMatCount, 'None' in default
        :param mode: 'offline' or 'online'
        :param Listsegments: a list of allowed glitches, which is used for online mode only, None in default
        :param re_sfchs: a list of safe channels except unused channels, which is used for online mode only,  None in default
        :param Data_outputpath: a directory saving for a HDF5 file, which is used for online mode only,  None in default
        :param Data_outputfilename: a file saving for a HDF5 file, which is used for online mode only,  None in default
        :param PlusHOFT: whether to get data of HOFT {'True', 'False'}, which is used for online mode only, 'False' in default
        :param number_process: a number of processes in parallel, which is used for online mode only, None in default
        return None
        '''
        if mode == 'offline':
            for group_label in list(self.f.keys()): # iterate though all glitches in a class
                g = self.f[group_label]
                if len(g.keys()) == 0: # if no dataset in this group, go to a next iteration
                    continue
                if len(g.keys()) / 3 == int: # if there are missing channels, go to a next iteration
                    continue
                #++++++++++ determine color upper and lower values +++++++++++++++++
                SmallestValue = 1 # dummy global minimum
                LargestValue = 0  # dummy global maximum
                for ch_label in g.keys(): # iterate through channel data sets
                    typeSeg = 'T' # only for target data sets
                    if str(ch_label)[2] == typeSeg: # take only target data sets
                        dset = g[ch_label] # get data set
                        localMin = dset.value.min() # find a local minima
                        localMax = dset.value.max() # find a local maxima
                        if localMin <= SmallestValue:
                            SmallestValue = localMin # replace a global minimum with a local minima
                        if LargestValue <= localMax:
                            LargestValue = localMax # replace a global maximum with a local maxima
                # set figure size
                fig = plt.figure(figsize=(25, 20))
                # set outer grid of this plot
                Base_outer = GridSpec(2, 50, wspace=0.3, hspace=0.5)
                outer = GridSpecFromSubplotSpec(1, 7, subplot_spec=Base_outer[:49], wspace=0.3, hspace=0)
                #++++++++++++ plot frequency vs channels grouped based on their sampling rates ++++++++++++++++++++
                for fig_num, target_samplerate in enumerate([256, 512, 1024, 2048, 4096, 8192, 16384][:], 0): # iterate though all the sampling rates
                    # make matrix having channels in row, frequency in column, and list of extra paramters
                    X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = self.make_subset_channel_based_on_samplingrate(g, target_samplerate)
                    if X.shape[0] != 0:
                        # transpose X (X.T) make this matrix having frequency in row, channel in column, but the frequency increases from the top to the bottom
                        X = X.T[::-1, :] # turn up side down by [::-1,:], to make the frequency values go from the bottom to the top
                        #+++++++++++ plot color indicator for channels on the top +++++++++++++++++++++++++++++
                        # color the background either with blakc or gray for each channel
                        Listcolor = ['black', 'gray'] * 20 * 7
                        # make grid of an inner plot
                        inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0)
                        # plot the color indicator for channel on the top of this plot
                        ax1 = fig.add_subplot(inner[:1, 0])  # First row, first column
                        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannel)
                        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                            if color_ind == ListInd.shape[-1] - 2: # add a cell for at the end to make a plot nicer
                                ax1.axvspan(edgeLeft, edgeRight + 1, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                            else:
                                ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                        ax1.axis(xmin=0, xmax=X.shape[-1])
                        ax1.set_xticks([]) # make ticks disappear
                        ax1.set_yticks([]) # make ticks disappear
                        #++++++++++ make a frequency versus channel plot +++++++++++++++++++
                        ax2 = fig.add_subplot(inner[1:, 0])  # First row, first column
                        ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, vmin=SmallestValue, vmax=LargestValue, origin='upper')
                        #ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, norm=LogNorm(vmin=SmallestValue, vmax=LargestValue)) # log scale
                        # make frequency ticks to be correct
                        cutoff_freq_left = 0
                        cutoff_freq_right = target_samplerate / 2
                        interval = 5
                        y_tics = np.linspace(cutoff_freq_left, cutoff_freq_right, interval, endpoint=True)[::-1].astype(int) # new ticks
                        ax2.set_yticks(np.linspace(0, X.shape[0] - 1, interval, endpoint=True)) # set y ticks locations
                        ax2.set_yticklabels(y_tics, size=15) # replace y ticks with a frequency values
                        ax2.set_xticks(CenterTicks) # locate x ticks
                        ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) # replace x ticks with channel names
                        ax2.xaxis.set_tick_params(labelbottom=False, labeltop=True, direction='in', pad=3, length=0, labelrotation=90, size=0, labelsize=10) # set x ticks paramters
                        ax2.grid(False)
                        # write y label only for the left pannel as the rest are duplicated
                        if fig_num == 0:
                            plt.ylabel('Frequency [Hz]', fontsize=20)
                    elif X.shape[0] == 0:
                        inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0)
                        ax2 = fig.add_subplot(inner[:, 0])
                        ax2.set_xticks([])
                        ax2.set_yticks([])
                        ax2.grid(False)
                        if fig_num == 0:
                            plt.ylabel('Frequency [Hz]', fontsize=20)
                ###### Make a color bar
                inner2 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[49], wspace=0.01, hspace=0.5)
                cmap = matplotlib.cm.viridis
                norm = matplotlib.colors.Normalize(vmin=SmallestValue, vmax=LargestValue)
                ax_bar = fig.add_subplot(inner2[0])
                cbar = matplotlib.colorbar.ColorbarBase(ax_bar, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
                ax_bar.grid(False)
                cbar.set_label('Whitened FFT', size=10)
                # get a URL of Q-transform of h(t)
                try:
                    URL = GravitySpy_df[GravitySpy_df['id'] == ID]['imgUrl'].tolist()[0]
                except IndexError: # deal with the one with no Q-transport plot of h(t)
                    URL = 'None'
                # give an overall title
                if LowerCutOffFreq == 'None' and UpperCutOffFreq == 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s\n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, URL), fontsize=20)
                elif LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}- Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), URL), fontsize=20)
                elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = -{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(UpperCutOffFreq), URL), fontsize=20)
                elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}-{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), int(UpperCutOffFreq), URL), fontsize=20)
                # x label for the above plot
                fig.text(0.5, 0.55, 'Auxiliary channel', ha='center', va='center', fontsize=20)
                #++++++++ plot importance versus channel
                inner3 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[50:], wspace=0.1, hspace=0.5)
                ax = fig.add_subplot(inner3[0])  # First row, first column
                # create a count matrix of this glitch
                MatCount, ListChannelName, ListSNR, ListConf = self.CreateMatCount(sigma, g, LowerCutOffFreq, UpperCutOffFreq)
                # actually plot it
                self.PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax=ax)
                output_file = '{}.pdf'.format(str(GPS).replace('.', '-'))
                output_path = os.path.join(output_dir, output_file)
                fig.savefig(output_path)
                plt.close()
                print('Completed plotting for a(n) {} glitch in {} at {}'.format(glitchtype, IFO, str(GPS)))
        elif mode == 'online':
            for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, Data_outputpath, Data_outputfilename, number_process, PlusHOFT): # iterate group by group
                #++++++++++ determine color upper and lower values +++++++++++++++++
                SmallestValue = 1 # dummy global minimum
                LargestValue = 0  # dummy global maximum
                for ch_label in g.keys(): # iterate through channel data sets
                    typeSeg = 'T' # only for target data sets
                    if str(ch_label)[2] == typeSeg: # take only target data sets
                        dset = g[ch_label] # get data set
                        localMin = dset.value.min() # find a local minima
                        localMax = dset.value.max() # find a local maxima
                        if localMin <= SmallestValue:
                            SmallestValue = localMin # replace a global minimum with a local minima
                        if LargestValue <= localMax:
                            LargestValue = localMax # replace a global maximum with a local maxima
                # set figure size
                fig = plt.figure(figsize=(25, 20))
                # set outer grid of this plot
                Base_outer = GridSpec(2, 50, wspace=0.3, hspace=0.5)
                outer = GridSpecFromSubplotSpec(1, 7, subplot_spec=Base_outer[:49], wspace=0.3, hspace=0)
                #++++++++++++ plot frequency vs channels grouped based on their sampling rates ++++++++++++++++++++
                for fig_num, target_samplerate in enumerate([256, 512, 1024, 2048, 4096, 8192, 16384][:], 0): # iterate though all the sampling rates
                    # make matrix having channels in row, frequency in column, and list of extra paramters
                    X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = self.make_subset_channel_based_on_samplingrate(g, target_samplerate)
                    # transpose X (X.T) make this matrix having frequency in row, channel in column, but the frequency increases from the top to the bottom
                    X = X.T[::-1, :] # turn up side down by [::-1,:], to make the frequency values go from the bottom to the top
                    #+++++++++++ plot color indicator for channels on the top +++++++++++++++++++++++++++++
                    # color the background either with blakc or gray for each channel
                    Listcolor = ['black', 'gray'] * 20 * 7
                    # make grid of an inner plot
                    inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0)
                    # plot the color indicator for channel on the top of this plot
                    ax1 = fig.add_subplot(inner[:1, 0])  # First row, first column
                    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannel)
                    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
                        if color_ind == ListInd.shape[-1] - 2: # add a cell for at the end to make a plot nicer
                            ax1.axvspan(edgeLeft, edgeRight + 1, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                        else:
                            ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
                    ax1.axis(xmin=0, xmax=X.shape[-1])
                    ax1.set_xticks([]) # make ticks disappear
                    ax1.set_yticks([]) # make ticks disappear
                    #++++++++++ make a frequency versus channel plot +++++++++++++++++++
                    ax2 = fig.add_subplot(inner[1:, 0])  # from the second row, first column
                    ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, vmin=SmallestValue, vmax=LargestValue, origin='upper')
                    #ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, norm=LogNorm(vmin=SmallestValue, vmax=LargestVlaue)) # log scale
                    # make frequency ticks to be correct
                    cutoff_freq_left = 0
                    cutoff_freq_right = target_samplerate / 2
                    interval = 5
                    y_tics = np.linspace(cutoff_freq_left, cutoff_freq_right, interval, endpoint=True)[::-1].astype(int) # new ticks
                    ax2.set_yticks(np.linspace(0, X.shape[0] - 1, interval, endpoint=True)) # set y ticks locations
                    ax2.set_yticklabels(y_tics, size=15) # replace y ticks with a frequency values
                    ax2.set_xticks(CenterTicks) # locate x ticks
                    ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) # replace x ticks with channel names
                    ax2.xaxis.set_tick_params(labelbottom=False, labeltop=True, direction='in', pad=3, length=0, labelrotation=90, size=0, labelsize=10) # set x ticks paramters
                    ax2.grid(False)
                    # write y label only for the left pannel as the rest are duplicated
                    if fig_num == 0:
                        plt.ylabel('Frequency [Hz]', fontsize=20)
                ###### Make a color bar
                inner2 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[49], wspace=0.01, hspace=0.5)
                cmap = matplotlib.cm.viridis
                norm = matplotlib.colors.Normalize(vmin=SmallestValue, vmax=LargestValue)
                ax_bar = fig.add_subplot(inner2[0])
                cbar = matplotlib.colorbar.ColorbarBase(ax_bar, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
                ax_bar.grid(False)
                cbar.set_label('Whitened FFT', size=10)
                #### make a title ######
                # get a URL of Q-transform of h(t)
                try:
                    URL = GravitySpy_df[GravitySpy_df['id'] == ID]['imgUrl'].tolist()[0]
                except IndexError: # deal with the one with no Q-transport plot of h(t)
                    URL = 'None'
                # give an overall title
                if LowerCutOffFreq == 'None' and UpperCutOffFreq == 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s\n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, URL), fontsize=20)
                elif LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}- Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), URL), fontsize=20)
                elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = -{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(UpperCutOffFreq), URL), fontsize=20)
                elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':
                    plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}-{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), int(UpperCutOffFreq), URL), fontsize=20)
                # x label for the above plot
                fig.text(0.5, 0.55, 'Auxiliary channel', ha='center', va='center', fontsize=20)
                #++++++++ plot importance versus channel
                inner3 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[50:], wspace=0.1, hspace=0.5)
                ax = fig.add_subplot(inner3[0])  # First row, first column
                # create a count matrix of this glitch
                MatCount, ListChannelName, ListSNR, ListConf = self.CreateMatCount(sigma, g, LowerCutOffFreq, UpperCutOffFreq)
                # actually plot it
                self.PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax=ax)
                output_file = '{}.pdf'.format(str(GPS).replace('.', '-'))
                output_path = os.path.join(output_dir, output_file)
                fig.savefig(output_path)
                plt.close()
                print('Completed saving a plot for a(n) {} glitch in {} at {}'.format(glitchtype, IFO, str(GPS)))
        return None


[docs]    def FrequencyBandColor(self, PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High):
        '''
        description:
            1. load files that have a low, middle, high frequency band importances. (GPSImportanceChannels.csv)
            2. make a matrix that has RGB color based on each importance
            3. output this matrix
        USAGE: RGBMat = FrequencyBandColor(PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High)

        :param PathGlitchChannel_Low: a path to a file that has a low frequency band
        :param PathGlitchChannel_Mid: a path to a file that has a middle frequency band
        :param PathGlitchChannel_High: a path to a file that has a high frequency band
        :return: RGBMat
        '''

        ### load data files
        try: # for low frequency band data
            dfLow = pd.read_csv(PathGlitchChannel_Low)
            dfLow = dfLow.drop(['Unnamed: 0', 'GPS'], axis=1)
        except IOError:
            print('There is not such a file in {}'.format(PathGlitchChannel_Low))
            sys.exit()

        try: # for mid frequency band data
            dfMid = pd.read_csv(PathGlitchChannel_Mid)
            dfMid = dfMid.drop(['Unnamed: 0', 'GPS'], axis=1)
        except IOError:
            print('There is not such a file in {}'.format(PathGlitchChannel_Mid))
            sys.exit()

        try:  # for high frequency band data
            dfHigh = pd.read_csv(PathGlitchChannel_High)
            dfHigh = dfHigh.drop(['Unnamed: 0', 'GPS'], axis=1)
        except IOError:
            print('There is not such a file in {}'.format(PathGlitchChannel_High))
            sys.exit()

        # make a matrix with all the elements equal to zero
        RGBMat = np.zeros((dfLow.values.shape[0], dfLow.values.shape[1], 3))
        # fill the each value as RGB colors
        # low freq is red, middle freq is green, high freq is blue
        for row_num in range(dfLow.shape[0]):
            for col_num in range(dfLow.shape[1]):
                RGBMat[row_num, col_num, 0] = dfLow.values[row_num, col_num]
                RGBMat[row_num, col_num, 1] = dfMid.values[row_num, col_num]
                RGBMat[row_num, col_num, 2] = dfHigh.values[row_num, col_num]
        return RGBMat