Source code for origli.utilities.veto_utilities

#!/usr/bin/env python
'''
file name: veto_utilities.py

this file contains the utilities to be used for finding veto channel
'''
##########################
# import module
###########################

try:
    import cv2
except:
    pass
import json
import numpy as np
import pandas as pd
import os
import matplotlib
from sklearn import mixture
from sklearn.decomposition import PCA
import h5py
import time
import random as rd
import xml.etree.ElementTree as ET # for XML file
import sys
from math import floor
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import concurrent.futures
import itertools as itrtls
from math import factorial
from scipy.stats import ttest_ind




try:
   from gwpy.timeseries import TimeSeries, TimeSeriesDict
   from gwpy.segments import DataQualityFlag
   from gwtrigfind import find_trigger_files
   from gwpy.table import EventTable
   from gwpy.table.io.pycbc import filter_empty_files as filter_pycbc_live_files
   from gwpy.time import tconvert
except ImportError:
   print('There is not gwpy module yet')


matplotlib.use("Agg")
# matplotlib.rcParams["backend"] = "TkAgg"
# Test display
# try:
#     matplotlib.rcParams["backend"] = "TkAgg"
#     m_dis = os.environ["DISPLAY"]
# except KeyError:
#     print("Display problems. Set to Agg")
#     matplotlib.use('Agg')
#
import matplotlib.pyplot as plt
plt.rc('text', usetex=False) # this is used for fixing a gwpy compatible issue
from matplotlib.colors import LogNorm
from matplotlib.ticker import MaxNLocator

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
# custom modules
from origli.utilities.utilities import Multiprocess_whitening
from origli.utilities.utilities import PlotTableAnalysis, IdentifyGlitch # for plot figures and the external trigger generator, etc
from origli.utilities.multiband_search_utilities import CreateAllChannels_rho_multband # multi-band search tools
import origli.utilities.const as Const # for multi frequency bands defined in



__author__ = "Kentaro Mogushi"
__copyright__ = "Copyright 2019-2021, Kentaro Mogushi"
__credits__ = ["Line for credits"]
__license__ = "GPL"
__version__ = "0.1.0"
__maintainer__ = "K. Mogushi"
__email__ = "kentaro.mogushi@ligo.org"
__status__ = "Developing"





#--------------------------------------------------------------


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#take time series of safe channels and calculate the absolute values of whitened FFT
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


[docs]def HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sampling_rate, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'):
    '''
    description:
        calculate the importance: a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel
    USAGE: Count = HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sampling_rate, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None')

    :param whitened_fft_target: whitened fft of the on-source window
    :param whitened_fft_PBG: whitened fft of the preceding off-source window
    :param whitened_fft_FBG: whitened fft of the following off-source window
    :param duration: a duration of the on-source window
    :param sampling_rate: sampling rate of a channel
    :param sigma: an integer to determine the upper bound of the off-source window
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return: Count: importance
    '''
    dsetT = whitened_fft_target # a target data set
    dsetP = whitened_fft_PBG # a preceding BG data set
    dsetF = whitened_fft_FBG # a following BG data set

    try: # test if it is an array
        dsetP[0]
    except ValueError: # if it is None
        dsetP = 0 # assign an integer
    try: # test if it is an array
        dsetF[0]
    except ValueError: # if it is None
        dsetF = 0 # assign an integer
    spT = dsetT # target data set pixel value
    if type(dsetP) != int: # this is required in the condition for "Either" BG selection
        spP = dsetP # a preceding BG data set pixel value
    else:
        spP = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
    if type(dsetF) != int: # this is required in the condition for "Either" BG selection
        spF = dsetF  # a following BG data set pixel value
    else:
        spF = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
    #=======================
    if type(dsetP) != int:
        Number_PBG_trials = spP.shape[0] / spT.shape[0]
        spP = spP.reshape(int(Number_PBG_trials), spT.shape[0])
    if type(dsetF) != int:
        Number_FBG_trials = spF.shape[0] / spT.shape[0]
        spF = spF.reshape(int(Number_FBG_trials), spT.shape[0])
    #=======================
    #++++++ for the case where there is no data measured at a channel +++++++++++++++++++
    #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++
    #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:  # if target and BG have data measured
        pT = spT #** 2 # power of target
        if type(dsetP) != int:
            pP = spP #** 2 # power of a preceding BG
        if type(dsetF) != int:
            pF = spF #** 2  # power of a following BG
        # low, high, or band pass filter if we know which frequency band is important
        if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
            ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
            if sampling_rate / 2 <= LowerCutOffFreq:
                pT = np.array([0])
                pP = np.array([0])
                pF = np.array([0])
            else:
                LowerCutOffElement = int(LowerCutOffFreq*duration)
                if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
                    pT = pT[LowerCutOffElement:]
                    if type(dsetP) != int:
                        pP = pP[:, LowerCutOffElement:]
                    if type(dsetF) != int:
                        pF = pF[:, LowerCutOffElement:]
                else: # if LowerCutOffElement is greater than the length of the vector
                    pT = np.array([0])
                    if type(dsetP) != int:
                        pP = np.array([0])
                    if type(dsetF) != int:
                        pF = np.array([0])
        elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
            # even if UpperCutOffElement is greater than the length of the vector is works fine
            UpperCutOffElement = int(UpperCutOffFreq * duration)
            pT = pT[:UpperCutOffElement]
            if type(dsetP) != int:
                pP = pP[:, :UpperCutOffElement]
            if type(dsetF) != int:
                pF = pF[:, :UpperCutOffElement]
        elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None':  # band pass filter
            ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
            if sampling_rate / 2 <= LowerCutOffFreq:
                pT = np.array([0])
                pP = np.array([0])
                pF = np.array([0])
            else:
                LowerCutOffElement = int(LowerCutOffFreq * duration)
                UpperCutOffElement = int(UpperCutOffFreq * duration)
                if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
                    pT = pT[LowerCutOffElement:UpperCutOffElement]
                    if type(dsetP) != int:
                        pP = pP[:, LowerCutOffElement:UpperCutOffElement]
                    if type(dsetF) != int:
                        pF = pF[:, LowerCutOffElement:UpperCutOffElement]
                else: # if LowCutOffElement is greater than the length of the vector
                    pT = np.array([0])
                    if type(dsetP) != int:
                        pP = np.array([0])
                    if type(dsetF) != int:
                        pF = np.array([0])
        else:
            pass
        if type(dsetP) != int and type(dsetF) != int:
            pB = np.concatenate((pP, pF), axis=0)
        elif type(dsetP) != int and type(dsetF) == int:
            pB = pP
        elif type(dsetP) == int and type(dsetF) != int:
            pB = pF
        GausCeiling = pB.mean() + sigma * pB.std(ddof=1) # gaussian noise ceiling
        try:
            Count = pT[pT > GausCeiling].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling
        except ZeroDivisionError: # in the situation where the number of the total elments is zero, I saw GausCeiling is nan. So this channel has mulfunction. I manually set Count = 0
            Count = 0
        # if the data is flat over the stretch of the segment, the upper bound threshold of BG called "GausCeiling" becomes zero (BG has to flactuate so that "GausCeiling" equal to zero is fake)
        # if so that importance called "Count" becomes 1. To prevent this fake importance, I set Count = 0 manually
        if GausCeiling == 0.:
            Count = 0
    else:
        Count = 0 # manually put zero because this channel has no contribution at this glitch ID
    return Count




[docs]def CreateRho(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, sigma, LowerCutOffFreq, UpperCutOffFreq):
    '''
    discription:
        1. calculate the whitened FFT of the on- and off-source window for a single channel
        2. compute the value of importance for a single channel

    :param full_timeseries: the full time series in gwpy object including on- and off source windows
    :param target_timeseries_start: the start time of the on-source window
    :param target_timeseries_end: the end time of the on-source window
    :param pre_background_start: the start time of the preceding off-source window
    :param pre_background_end: the end time of the preceding off-source window
    :param fol_background_start: the start time of the following off-source window
    :param fol_background_end: the end time of the following off-source window
    :param sigma: an interger to calculate the value of importance
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return: Count: the importance: a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel
    '''
    # calculate the whitened FFT of the on- and off-source windows
    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, duration = Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)
    # calculate the importance
    Count = HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sample_rate, sigma, LowerCutOffFreq, UpperCutOffFreq)
    return Count


[docs]def CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq):
    '''
    description:
        1. use a single glitch time
        2. query timeseries of all the channels around a glitch
        3. condition (whitening and compare the on- and off-source window)
        4. quantify all the channels (compute values of importance of all the channels)
    USAGE: List_Count, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)

    :param Listsegment: a list of segment parameters
    :param IFO
    :param channels: a list of safe channels
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :param sigma: an integer to be used for calculating values of importance
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    '''
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]

    IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment
    t0 = time.time()
    # extracting the target data and the proceeding and following backgrounds
    try: # deal with time series in bad state
        if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background
            full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
        else:
            pass
        if PlusHOFT == 'True': # get the data of hoft
            if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is an available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0:  # if there is an available proceeding background and if there is NO available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0:  # if there is NO available proceeding background and if there is an available following background
                full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
            else:
                pass
            full_timeseriesDict.append(full_timeseriesDict_hoft) # append
            del full_timeseriesDict_hoft # delete the hoft object
            channels = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO))
    except IndexError:
        # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099)
        return 0, 0, 0, 0, 0, 0, 0
    except RuntimeError:
        return 0, 0, 0, 0, 0, 0, 0
    except IOError: # error when getting time series
        print('IOError')
        return 0, 0, 0, 0, 0, 0, 0
    except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ]
        print('ValueError')
        return 0, 0, 0, 0, 0, 0, 0
    try:
        List_Count = np.array([]) # a list of rho
        # whitened target and BG segments with multi processes
        with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
            for Count in executor.map(CreateRho,
                                    full_timeseriesDict.values(),
                                    itrtls.repeat(target_timeseries_start),
                                    itrtls.repeat(target_timeseries_end),
                                    itrtls.repeat(pre_background_start),
                                    itrtls.repeat(pre_background_end),
                                    itrtls.repeat(fol_background_start),
                                    itrtls.repeat(fol_background_end),
                                    itrtls.repeat(sigma),
                                    itrtls.repeat(LowerCutOffFreq),
                                    itrtls.repeat(UpperCutOffFreq)
                                    ):

                List_Count = np.append(List_Count, Count) # fill
    except ValueError:
        print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0
    except IOError: # IOError: Driver write request failed
        print('*** delete group gps{:05d} due to memory error  ***'.format(IndexSatisfied))
        return 0, 0, 0, 0, 0, 0, 0
    t1 = time.time()
    print('Time taken: {} s'.format(t1 - t0))
    return List_Count, re_sfchs, gpstime, duration, SNR, confi, ID  # pass the parameters


#
# def WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle='True', PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None'):
#     '''
#     description:
#         1. use a list of glitches
#         2. analyze glitches of the number of 'first chunk' with all the channels of 're_sfchs_init'
#         3. perform one-sided binomial test and Welch one-sided t-test
#         4. reject the channel that do NOT pass the both tests, i.e., can not reject a hypothesis that a channel is consistent with null samples
#         5. calculate the error ratio of the t-value of the top ranking channel to the previous t-value
#         6. analyze a next glitch using the channels that pass the both tests
#         7. add the values of importance to the passed analyzed samples
#         8. repeat (3)-(7)
#         9. terminate the process when the error ratio reaches the tolerance
#     :param Listsegments: a list of glitches
#     :param IFO: ifo
#     :param re_sfchs_init: all thes safe channels to be used at the beginning
#     :param sigma: an integer to determine the upper bound of the off-source window
#     :param number_process: number processes of a machine
#     :param first_chunk: the number of samples to be used for the first chunk, where all the channels are to be used
#     :param tolerate: tolerance number to stop the analysis
#     :param confidence_level: confidence level for one-sided binomial test and Welch one-sided t-test
#     :param df_null: null samples in pandas frame, which is expected to already created
#     :param shuffle: boolen, whether shuffle the list of glitches
#     :param PlusHOFT: boolen, whether analysis hoft
#     :param LowerCutOffFreq: a lower cutoff frequency
#     :param UpperCutOffFreq: an upper cutoff frequency
#     :return:
#         re_sfchs: a list of channels that will have passed the tests untill the tolerance
#         MatCount: a matrix of importance of the channels that passed the test
#         list_Causal_passed_final: a list of witness ratio statistics of the channels that passed the tests
#         list_t_values_passed_final: a list of t-values of the channels that passed the tests
#     USAGE: re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final = WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle=True, PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None')
#     '''
#     t0 = time.time()
#     re_sfchs = re_sfchs_init
#     # get a class object creating plots and table
#     plotter = PlotTableAnalysis()
#     index = 0 # index of a sample to be analyzed, first it is set to zero
#     MatCount = np.array([])
#     ListGPS = np.array([])
#     ListDuration = np.array([])
#     ListSNR = np.array([])
#     ListConfi = np.array([])
#     ListID = np.array([])
#     if shuffle == 'True':
#         rd.shuffle(Listsegments)
#     else:
#         pass
#     for Listsegment in Listsegments:
#         ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)
#         if type(ListCount) == int: # if some error occurs, List_rho is 0, go to the next iteration
#             continue
#         else: # if the rho is calculated correctly
#             MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
#             if index == 0:
#                 MatCount = MatCount.reshape(1, MatCount.shape[-1])
#             ListGPS = np.append(ListGPS, gpstime)
#             ListDuration = np.append(ListDuration, duration)
#             ListSNR = np.append(ListSNR, SNR)
#             ListConfi = np.append(ListConfi, confi)
#             ListID = np.append(ListID, ID)
#             # concatenate a list of GPS times and importance of all the channels
#             MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1)
#             # make a list of column names
#             columnName = np.array(['GPS', 'duration', 'SNR', 'confidence'])
#             columnName = np.append(columnName, re_sfchs)
#             # convert it to a pandas frame
#             df_target = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
#             if index >= first_chunk: # the sample size reaches more or equal to first_chunk
#                 #------- perform one-sided binomial test and Welch one-sided t-test -------
#                 list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = plotter.find_meaing_ful_confidence(df_target, df_null, confidence_level, channels=re_sfchs)
#                 _, list_t_values_passed, list_t_values_failed, list_Test, confidence_level_used = plotter.perform_Welch_test(df_target, df_null, confidence_level, channels=re_sfchs)
#                 # ----- find the channel name that has the largest t-value
#                 max_t_value = list_t_values_passed.max() # largest t-value
#                 channel_max_t_value = re_sfchs[list_t_values_passed == max_t_value][0]
#                 #------- keep channels which pass both binomial and Welch t-test --------
#                 # find the indices of channels that pass binomial test
#                 index_channels_pass_bio = np.where(list_Causal_passed != 0)[0]
#                 # find the indices of channels that pass Welch t-test
#                 index_channels_pass_welch = np.where(list_t_values_passed != 0)[0]
#                 # find the common indices of channels that pass the both tests
#                 index_pass_both = np.intersect1d(index_channels_pass_bio, index_channels_pass_welch)
#                 # keep the channels that pass the both tests
#                 re_sfchs = re_sfchs[index_pass_both]
#                 # keep values of importance of channels that pass both tests in MatCount
#                 MatCount = MatCount[:, index_pass_both]
#                 # keep the t-values of the channel that pass Welch t-test
#                 df_t_value = pd.DataFrame(list_t_values_passed.reshape(1, list_t_values_passed.shape[-1])[:, index_pass_both], columns=re_sfchs)
#                 print('=' * 50)
#                 print('index: {}'.format(index))
#                 print('Number of channels that pass the tests with confidence level of {}%: {}'.format(confidence_level*100, index_pass_both.shape[0]))
#                 print('Largest t-value {} is given by {}'.format(max_t_value, channel_max_t_value))
#                 if index > first_chunk:
#                     ratio_t_test = ((df_t_value[channel_max_t_value] - df_t_value_previous[channel_max_t_value]) / df_t_value_previous[channel_max_t_value]).values[0]
#                     print('Ratio error of the current t-value to the previous of {}: {}'.format(channel_max_t_value, ratio_t_test))
#                     if abs(ratio_t_test) < tolerance:
#                         print('Terminate the analysis as the ratio error reached the tolerance of {}'.format(tolerance))
#                         break # break the loop
#                 else:
#                     pass
#                 df_t_value_previous = df_t_value
#             index += 1
#     list_Causal_passed_final = list_Causal_passed[index_pass_both]
#     list_t_values_passed_final = list_t_values_passed[index_pass_both]
#     t1 = time.time()
#     if abs(ratio_t_test) >= tolerance: # if not the error ratio does not reaches the tolerance
#         print('Terminate the analysis as all the samples have been analyzed')
#         print('The current ratio error {} does not pass the tolerance of {}'.format(ratio_t_test, tolerance))
#     else:
#         pass
#     print('Total analysis time taken for finding witness channels: {} s'.format(t1 - t0))
#     return re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final, df_target
#
#


[docs]def WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle='True', PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None', freq_bands=Const.freq_bands):
    '''
    description:
        1. use a list of glitches
        2. analyze glitches of the number of 'first chunk' with all the channels of 're_sfchs_init'
        3. perform one-sided binomial test and Welch one-sided t-test
        4. reject the channel that do NOT pass the both tests, i.e., can not reject a hypothesis that a channel is consistent with null samples
        5. calculate the error ratio of the t-value of the top ranking channel to the previous t-value
        6. analyze a next glitch using the channels that pass the both tests
        7. add the values of importance to the passed analyzed samples
        8. repeat (3)-(7)
        9. terminate the process when the error ratio reaches the tolerance
    USAGE: re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final = WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle=True, PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None')

    :param Listsegments: a list of glitches
    :param IFO: ifo
    :param re_sfchs_init: all thes safe channels to be used at the beginning
    :param sigma: an integer to determine the upper bound of the off-source window
    :param number_process: number processes of a machine
    :param first_chunk: the number of samples to be used for the first chunk, where all the channels are to be used
    :param tolerate: tolerance number to stop the     USAGE: List_Count, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)
analysis
    :param confidence_level: confidence level for one-sided binomial test and Welch one-sided t-test
    :param df_null: null samples in pandas frame, which is expected to already created
    :param shuffle: boolen, whether shuffle the list of glitches
    :param PlusHOFT: boolen, whether analysis hoft
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return:
        re_sfchs: a list of channels that will have passed the tests untill the tolerance
        MatCount: a matrix of importance of the channels that passed the test
        list_Causal_passed_final: a list of witness ratio statistics of the channels that passed the tests
        list_t_values_passed_final: a list of t-values of the channels that passed the tests
    '''
    t0 = time.time()
    re_sfchs = re_sfchs_init
    # get a class object creating plots and table
    plotter = PlotTableAnalysis()
    index = 0 # index of a sample to be analyzed, first it is set to zero
    MatCount = np.array([])
    ListGPS = np.array([])
    ListDuration = np.array([])
    ListSNR = np.array([])
    ListConfi = np.array([])
    ListID = np.array([])
    if shuffle == 'True':
        rd.shuffle(Listsegments)
    else:
        pass
    for Listsegment in Listsegments:
        if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen
            re_sfchs_previous = re_sfchs # keep the channels for the later
            if re_sfchs[0][-3:-1] == "BP": # if the channel names are constructed with frequency-bands
                # get rid of _BP# and the end of channel names.
                re_sfchs_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in re_sfchs])
                # get the channel name without duplication
                re_sfchs_queried = np.unique(re_sfchs_queried)
                # query data and calculate values of importance and etc
                IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, re_sfchs_queried, number_process, PlusHOFT, sigma)
            else: # at the first time of querying data (at the first iteration, channel has no "BP#" at the end of channel name)
                # query data and calculate values of importance and etc
                IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma)
            # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes
            re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))])
            re_sfchs = re_sfchs.flatten(order='F')  # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0
            ListCount = Mat_Count_in_multibands.flatten(order='F')  # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0)
            if re_sfchs_previous[0][-3:-1] == "BP":  # if the channel names are constructed with frequency-bands
                # keep the channels in the bands that pass the test in the previous round
                keep_index = np.array([np.where(re_sfchs == re_sfch_previous)[0][0] for re_sfch_previous in re_sfchs_previous])
                re_sfchs = re_sfchs[keep_index]
                ListCount = ListCount[keep_index]
        else:
            ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)
        if type(ListCount) == int: # if some error occurs, List_rho is 0, go to the next iteration
            continue
        else: # if the rho is calculated correctly
            MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
            if index == 0:
                MatCount = MatCount.reshape(1, MatCount.shape[-1])
            ListGPS = np.append(ListGPS, gpstime)
            ListDuration = np.append(ListDuration, duration)
            ListSNR = np.append(ListSNR, SNR)
            ListConfi = np.append(ListConfi, confi)
            ListID = np.append(ListID, ID)
            # concatenate a list of GPS times and importance of all the channels
            MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1)
            # make a list of column names
            columnName = np.array(['GPS', 'duration', 'SNR', 'confidence'])
            columnName = np.append(columnName, re_sfchs)
            # convert it to a pandas frame
            df_target = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
            if index >= first_chunk: # the sample size reaches more or equal to first_chunk
                #------- perform one-sided binomial test and Welch one-sided t-test -------
                list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = plotter.find_meaing_ful_confidence(df_target, df_null, confidence_level, channels=re_sfchs)
                _, list_t_values_passed, list_t_values_failed, list_Test, confidence_level_used = plotter.perform_Welch_test(df_target, df_null, confidence_level, channels=re_sfchs)
                # ----- find the channel name that has the largest t-value
                max_t_value = list_t_values_passed.max() # largest t-value
                channel_max_t_value = re_sfchs[list_t_values_passed == max_t_value][0]
                #------- keep channels which pass both binomial and Welch t-test --------
                # find the indices of channels that pass binomial test
                index_channels_pass_bio = np.where(list_Causal_passed != 0)[0]
                # find the indices of channels that pass Welch t-test
                index_channels_pass_welch = np.where(list_t_values_passed != 0)[0]
                # find the common indices of channels that pass the both tests
                index_pass_both = np.intersect1d(index_channels_pass_bio, index_channels_pass_welch)
                # keep the channels that pass the both tests
                re_sfchs = re_sfchs[index_pass_both]
                # keep values of importance of channels that pass both tests in MatCount
                MatCount = MatCount[:, index_pass_both]
                # keep the t-values of the channel that pass Welch t-test
                df_t_value = pd.DataFrame(list_t_values_passed.reshape(1, list_t_values_passed.shape[-1])[:, index_pass_both], columns=re_sfchs)
                print('=' * 50)
                print('index: {}'.format(index))
                print('Number of channels that pass the tests with confidence level of {}%: {}'.format(confidence_level*100, index_pass_both.shape[0]))
                print('Largest t-value {} is given by {}'.format(max_t_value, channel_max_t_value))
                if index > first_chunk:
                    ratio_t_test = ((df_t_value[channel_max_t_value] - df_t_value_previous[channel_max_t_value]) / df_t_value_previous[channel_max_t_value]).values[0]
                    print('Ratio error of the current t-value to the previous of {}: {}'.format(channel_max_t_value, ratio_t_test))
                    if abs(ratio_t_test) < tolerance:
                        print('Terminate the analysis as the ratio error reached the tolerance of {}'.format(tolerance))
                        break # break the loop
                else:
                    pass
                df_t_value_previous = df_t_value
            index += 1
    list_Causal_passed_final = list_Causal_passed[index_pass_both]
    list_t_values_passed_final = list_t_values_passed[index_pass_both]
    t1 = time.time()
    if abs(ratio_t_test) >= tolerance: # if not the error ratio does not reaches the tolerance
        print('Terminate the analysis as all the samples have been analyzed')
        print('The current ratio error {} does not pass the tolerance of {}'.format(ratio_t_test, tolerance))
    else:
        pass
    print('Total analysis time taken for finding witness channels: {} s'.format(t1 - t0))
    return re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final, df_target




[docs]def BackgroundCut(df_null, channel, background_upper_cut):
    '''
    description:
        calculate the upper cut of channels using the FAP distribution
    USAGE: cut = BackgroundCut(df_null, channel, background_upper_cut)

    :param df_null: null samples in pandas frame
    :param channel: a list of channels
    :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g.,  1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426
    :return: cut: an uuper cut of the null sample of those channels
    '''
    count, edges = np.histogram(df_null[channel], bins=500, range=(0, 1))
    delta_edge = edges[1] - edges[0]
    edg_cent = edges[:-1] + delta_edge/2.
    totalN = float(df_null.shape[0])
    frac = count / totalN # fraction of samples
    FAP = np.cumsum(frac[::-1])[::-1] # cumulative number of fraction of samples in ascending order from the right
    #percentile = FAP[FAP < 1-background_upper_cut][0]
    percentile = FAP[FAP > 1 - background_upper_cut][-1]
    index = np.where(FAP == percentile)[0][0]
    cut = edg_cent[index]
    return cut


[docs]def FlagFinder(Epoch_lt, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq, freq_bands=Const.freq_bands):
    '''
    description:
        1. select high ranking witness channels
        2. determine the upper cut of the null samples for those high ranking witness channels
        3. analyze all the glitches using the selected witness channels
        4. make a flag when those channels give importance above the upper cut of the null (flags are made if ALL the chosen witness have value of importance above the upper cut of the null samples)
        5. calculate efficiency and deadtime
    USAGE:  efficiency, deadtime_frac, df = FlagFinder(Epoch_lt, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq)

    :param Epoch_lt: a list of an epoch
    :param Listsegments: a list of glitches
    :param IFO: ifo
    :param channels: a list of channels, which are expected to be witness channels
    :param list_statistics: a list of ranking statistics, either witness ratio statistics or t-value
    :param num_high_rank_channels_to_be_used: number of high ranking channels to be used for making flag
    :param df_null: null samples in pandas frame
    :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g.,  1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426
    :param number_process: number of processors
    :param sigma: an integer to determine the upper bound of the off-source window
    :param PlusHOFT: boolen, whether analyze hoft
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return:
        efficiency: a ratio of glitches that are flagged to total glitches analyzed without an issue of no data available
        deadtime_frac: a ratio of total on-source windows to the total analysis time
        df: a matrix of GPS, duration, SNR, confidence level of classification of glitches, flag and importance in pandas frame
    '''
    t0 = time.time()
    nummber_channels = channels.shape[0]
    if num_high_rank_channels_to_be_used > nummber_channels:
        num_high_rank_channels_to_be_used = nummber_channels
    else:
        pass
    #----- find the high ranking channels
    # sort the channels based the statistics
    print(channels)
    Ranked_channels = [channels[i] for i in np.argsort(list_statistics)[::-1]]
    # sort the statistics
    Ranked_statistics = [list_statistics[i] for i in np.argsort(list_statistics)[::-1]]
    # take the high ranking channels
    Ranked_channels_to_be_used = Ranked_channels[:num_high_rank_channels_to_be_used]
    # determine the lower cut of the high ranking channels
    print('Witness channel(s) used for flag')
    print('{}% confidence level for the upper cut of the background distribution'.format(background_upper_cut*100))
    cuts = np.array([])
    for Ranked_channel in Ranked_channels_to_be_used:
        cut = BackgroundCut(df_null, Ranked_channel, background_upper_cut)
        cuts = np.append(cuts, cut)
        print('{} with background cut of {}'.format(Ranked_channel, cut))
    #---- analyze glitches ------------
    index = 1
    MatCount = np.array([])
    ListGPS = np.array([])
    ListDuration = np.array([])
    ListSNR = np.array([])
    ListConfi = np.array([])
    ListID = np.array([])
    ListFlag = np.array([])
    deadtime = 0
    veto = 0
    if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband":  # multi-frequency band search is chosen
        # get rid of _BP# and the end of channel names.
        Ranked_channels_to_be_used_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in Ranked_channels_to_be_used])
        # get the channel name without duplication
        Ranked_channels_to_be_used_queried = np.unique(Ranked_channels_to_be_used_queried)
    for Listsegment in Listsegments:
        if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband":  # multi-frequency band search is chosen
            IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, Ranked_channels_to_be_used_queried, number_process, PlusHOFT, sigma)
            # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes
            re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))])
            re_sfchs = re_sfchs.flatten(order='F')  # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0
            ListCount = Mat_Count_in_multibands.flatten(order='F')  # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0)
            # keep the channels in the bands
            keep_index = np.array([np.where(re_sfchs == Ranked_channel_to_be_used)[0][0] for Ranked_channel_to_be_used in Ranked_channels_to_be_used])
            ListCount = ListCount[keep_index]
        else:
            ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, Ranked_channels_to_be_used, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)
        if type(ListCount) == int: # if some error occurs, List_rho is None, go to the next iteration
            continue
        else: # if the rho is calculated correctly
            MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
            #if MatCount.shape[-1] == 1:
            if index == 1:
                MatCount = MatCount.reshape(1, MatCount.shape[-1])
            ListGPS = np.append(ListGPS, gpstime)
            ListDuration = np.append(ListDuration, duration)
            ListSNR = np.append(ListSNR, SNR)
            ListConfi = np.append(ListConfi, confi)
            ListID = np.append(ListID, ID)
            index += 1
            if (ListCount >  cuts).all() == True: # if all the high ranking channel have values of importance above cuts
                print('Index {}: A glitch at GPS = {} is flagged'.format(index, gpstime))
                veto += 1
                deadtime += duration
                ListFlag = np.append(ListFlag, 'Y')
            else:
                print('Index {}: A glitch at GPS = {} is NOT flagged'.format(index, gpstime))
                ListFlag = np.append(ListFlag, 'N')
    efficiency = veto / float(index)
    deadtime_frac = deadtime / float(Epoch_lt[-1][-1] - Epoch_lt[0][0])
    # concatenate a list of GPS times and importance of all the channels
    MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), ListFlag.reshape(ListFlag.shape[0], 1), MatCount), axis=1)
    # make a list of column names
    columnName = np.array(['GPS', 'duration', 'SNR', 'confidence', 'flag'])
    columnName = np.append(columnName, Ranked_channels_to_be_used)
    # convert it to a pandas frame
    df = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
    t1 = time.time()
    print('Analysis time taken: {} s'.format(t1 - t0))
    return efficiency, deadtime_frac, df


[docs]def FlagFinder_all_witnesses(Proportion_Duration_Bfr_Centr, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq, freq_bands=Const.freq_bands):
    '''
    description:
        1. select high ranking witness channels
        2. determine the upper cut of the null samples for those high ranking witness channels
        3. analyze all the glitches using the selected witness channels
        4. make flags when those channels give importance above the upper cut of the null (flags are make for individual channels)
    USAGE:  df_flag = FlagFinder(Proportion_Duration_Bfr_Centr, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq)

    :param Proportion_Duration_Bfr_Centr: a fraction of the on-source window before the peak GPS time
    :param Listsegments: a list of glitches
    :param IFO: ifo
    :param channels: a list of channels, which are expected to be witness channels
    :param list_statistics: a list of ranking statistics, either witness ratio statistics or t-value
    :param num_high_rank_channels_to_be_used: number of high ranking channels to be used for making flag
    :param df_null: null samples in pandas frame
    :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g.,  1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426
    :param number_process: number of processors
    :param sigma: an integer to determine the upper bound of the off-source window
    :param PlusHOFT: boolen, whether analyze hoft
    :param LowerCutOffFreq: a lower cutoff frequency
    :param UpperCutOffFreq: an upper cutoff frequency
    :return:
        df_flag: a matrix of GPS, duration, SNR, confidence level of classification of glitches, importance of the witness channels, the flags of the witness channels in pandas frame
    '''
    t0 = time.time()
    nummber_channels = channels.shape[0]
    if num_high_rank_channels_to_be_used > nummber_channels:
        num_high_rank_channels_to_be_used = nummber_channels
    else:
        pass
    #----- find the high ranking channels
    # sort the channels based the statistics
    Ranked_channels = [channels[i] for i in np.argsort(list_statistics)[::-1]]
    print(Ranked_channels)
    # sort the statistics
    Ranked_statistics = [list_statistics[i] for i in np.argsort(list_statistics)[::-1]]
    # take the high ranking channels
    Ranked_channels_to_be_used = Ranked_channels[:num_high_rank_channels_to_be_used]
    # determine the lower cut of the high ranking channels
    print('Witness channel(s) used for flag')
    print('{}% confidence level for the upper cut of the background distribution'.format(background_upper_cut*100))
    cuts = np.array([])
    for Ranked_channel in Ranked_channels_to_be_used:
        cut = BackgroundCut(df_null, Ranked_channel, background_upper_cut)
        cuts = np.append(cuts, cut)
        print('{} with background cut of {}'.format(Ranked_channel, cut))
    #---- analyze glitches ------------
    index = 1
    MatCount = np.array([])
    ListGPS = np.array([])
    ListDuration = np.array([])
    ListSNR = np.array([])
    ListConfi = np.array([])
    ListID = np.array([])

    #ListFlag = np.array([])
    MatFlag = {}
    for channel in Ranked_channels_to_be_used:
        MatFlag['{}_flag'.format(channel)] = []
    if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband":  # multi-frequency band search is chosen
        # get rid of _BP# and the end of channel names.
        Ranked_channels_to_be_used_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in Ranked_channels_to_be_used])
        # get the channel name without duplication
        Ranked_channels_to_be_used_queried = np.unique(Ranked_channels_to_be_used_queried)
    for Listsegment in Listsegments:
        if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband":  # multi-frequency band search is chosen
            IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, Ranked_channels_to_be_used_queried, number_process, PlusHOFT, sigma)
            # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes
            re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))])
            re_sfchs = re_sfchs.flatten(order='F')  # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0
            ListCount = Mat_Count_in_multibands.flatten(order='F')  # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0)
            # keep the channels in the bands
            keep_index = np.array([np.where(re_sfchs == Ranked_channel_to_be_used)[0][0] for Ranked_channel_to_be_used in Ranked_channels_to_be_used])
            ListCount = ListCount[keep_index]
        else:
            ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, Ranked_channels_to_be_used, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq)
        if type(ListCount) == int: # if some error occurs, List_rho is None, go to the next iteration
            continue
        else: # if the rho is calculated correctly
            MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
            #if MatCount.shape[-1] == 1:
            if index == 1:
                MatCount = MatCount.reshape(1, MatCount.shape[-1])
            ListGPS = np.append(ListGPS, gpstime)
            ListDuration = np.append(ListDuration, duration)
            ListSNR = np.append(ListSNR, SNR)
            ListConfi = np.append(ListConfi, confi)
            ListID = np.append(ListID, ID)
            index += 1
            for i in range(len(ListCount)):
                if ListCount[i] > cuts[i]:
                    MatFlag['{}_flag'.format(Ranked_channels_to_be_used[i])].append('Y')
                else:
                    MatFlag['{}_flag'.format(Ranked_channels_to_be_used[i])].append('N')
    # concatenate a list of GPS times and importance of all the channels
    MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1)
    # make a list of column names
    columnName = np.array(['GPStime', 'duration', 'SNR', 'confidence'])
    columnName = np.append(columnName, Ranked_channels_to_be_used)
    # convert it to a pandas frame
    df = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
    df_flag = pd.DataFrame(MatFlag)
    df_flag = pd.concat([df, df_flag], axis=1)
    df_flag = df_flag.assign(start_time=(df_flag["GPStime"].astype("float") - df_flag.duration.astype("float") * Proportion_Duration_Bfr_Centr).round(3))
    df_flag = df_flag.assign(end_time=(df_flag["GPStime"].astype("float") + df_flag.duration.astype("float") * (1 - Proportion_Duration_Bfr_Centr)).round(3))
    t1 = time.time()
    print('Analysis time taken: {} s'.format(t1 - t0))
    return df_flag


[docs]def make_veto_omicron_in_aux(epoch_start, epoch_end, IFO, channel, df_foreground, glitch_type, Proportion_Duration_Bfr_Centr, SNR_thresh, df_flag, OutputHDF5_dir, ifostate, N_processes):
    '''
    description:
        1. query omicron triggers (aux) of a witness channel
        2. find the aux omicron triggers which are coincident with the glitches that are analyzed
        3. find the aux SNR cut which corresponds to the importance cut of this witness channel
        4. find the aux omicron triggers which are coincident with all the glitches with label being studied
        5. veto glitches when the coincident aux triggers have SNR above the aux SNR cut (given in the step 3)
    USAGE:  rho_cut, snr_cut, deadtime, efficiency, efficiency_over_deadtime, df_target = make_veto_omicron_in_aux(epoch_start, epoch_end, IFO, channel, df_foreground, glitch_type, Proportion_Duration_Bfr_Centr, SNR_thresh, df_flag, OutputHDF5_dir, ifostate, N_processes)

    :param epoch_start: start time of the analysis period
    :param epoch_end: end time of the analysis period
    :param IFO: ifo {'H1' or 'L1'}
    :param channel: a witness channel name, it could be a channel in a particular frequency band
    :param df_foreground: a pandas data frame of all the glitches in the strain channel fed into pychChoo
    :param glitch_type: a glitch type that is focused on
    :param Proportion_Duration_Bfr_Centr: a fraction the on-source window before the peak GPS time of a glitch
    :param SNR_thresh: a lower SNR threshold to select glitches that are studied
    :param df_flag: a pandas data frame of flagged (including 'Y' and 'N') of the witness channels for the glitches that have been analyzed with FlagFinder_all_witnesses()
    :param OutputHDF5_dir: a output directory where the omicron trigger of a witness channel
    :param ifostate: state of an ifo
    :param N_processes: number of cores
    :return:
        rho_cut: lower cut of importance of a witness channel
        snr_cut: corresponding SNR cut of this witness channel
        deadtime: a fraction of the time that are vetoed during the analysis time
        efficiency: a fraction of glitches that are vetoed
        efficiency_over_deadtime: ratio of efficiency over deadtime
        df_target: a pandas data frame of this witness, where the glitches that are vetoed are marked as 'Y'
    '''
    t0 = time.time()
    #---------------------------------------------------+
    # get the omicron trigger of the witness channel    |
    #---------------------------------------------------+
    indentify = IdentifyGlitch()
    if channel[-3:-1] == "BP": # if multi-band search is selected
        channel_original = channel.rsplit("_BP", 1)[0]
    else:
        channel_original = channel

    # get the state vector
    ListEpochs, trigger_dir = indentify.FindObservingTimeSegments(IFO, epoch_start, epoch_end, OutputHDF5_dir, state=ifostate)
    # get the omicron triggers of the witness channel
    indentify.GetTriggerMetadata(ListEpochs, IFO, trigger_dir, N_processes, trigger_pipeline='omicron', channel=channel_original)
    # get the omicron triggers
    path_metadata = os.path.join(trigger_dir, 'TriggerMetadata.csv')
    # load a meta data matrix
    df_aux = pd.read_csv(path_metadata)
    #-------------------------------------------------------------------+
    # all the glitches in h(t) within the epoch and the label chosen    |
    #-------------------------------------------------------------------+
    # all the glitches during the epoch
    df_foreground = df_foreground[(epoch_start <= df_foreground["GPStime"]) & (df_foreground["GPStime"] <= epoch_end)]  # all the glitches within the epoch
    # all the glitches during the epoch and the label is the one chosen
    df_target = df_foreground[(df_foreground["label"] == glitch_type) & (df_foreground["snr"] > SNR_thresh)]  # all the target samples
    # assgin the start and end time of the on-source windows
    df_target = df_target.assign(start_time=df_target["GPStime"] - Proportion_Duration_Bfr_Centr * df_target["duration"])
    df_target = df_target.assign(end_time=df_target["GPStime"] + (1 - Proportion_Duration_Bfr_Centr) * df_target["duration"])
    # drop duplicates
    df_flag = df_flag.drop_duplicates(subset=["GPStime", "duration"]).reset_index()
    df_target = df_target.drop_duplicates(subset=["GPStime", "duration"]).reset_index()
    #---------------------------------------------------+
    # mark triggers coincident with the flagged samples |
    #---------------------------------------------------+
    # mark triggers coincident with the flagged samples
    df_aux = df_aux.assign(flag_index=['None'] * df_aux.shape[0])
    df_aux = df_aux.assign(flag_rep_index=['None'] * df_aux.shape[0])
    for flag_index in range(df_flag.shape[0]): # iterate the all the flagged samples (subset samples )
        # start, end, and the peak GPS timeos of the flagged samples
        flag_start, flag_end, flag_center_GPS = df_flag.iloc[flag_index]["start_time"], df_flag.iloc[flag_index]["end_time"], df_flag.iloc[flag_index]["GPStime"]
        # mark the omicron trigggers of the witness channel that are within the flagged segment
        df_aux.loc[(flag_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < flag_end), 'flag_index'] = flag_index
        # get the triggers within the on-source
        df_within = df_aux[(flag_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < flag_end)]
        try:
            # values of SNR devided the difference of the trigger times (aux) and the peak time of the trigger (hoft)
            SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - flag_center_GPS)
            # the representative glitch is chosen such as to maximize the SNR / time difference
            index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()]
            # mark the trigger as that flag index
            df_aux.at[index_maxima, "flag_rep_index"] = flag_index
        except ValueError: # if no triggers of the witness channel in the on-source is not found
            pass
            #print(flag_center_GPS, flag_end - flag_start)
            #print(flag_index)
    #----------------------------------------------------+
    # mark triggers coincident with the target samples   |
    #----------------------------------------------------+
    df_aux = df_aux.assign(target_index=['None'] * df_aux.shape[0])
    df_aux = df_aux.assign(target_rep_index=['None'] * df_aux.shape[0])
    for target_index in range(df_target.shape[0]): # iterate the target glitch samples
        # the start, end and the peak GPS time of the glitches (hoft)
        target_start, target_end, target_center_GPS = df_target.iloc[target_index]["start_time"], df_target.iloc[target_index]["end_time"], df_target.iloc[target_index]["GPStime"]
        # mark the omicron trigggers of the witness channel that are within the on-source window
        df_aux.loc[(target_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < target_end), 'target_index'] = target_index
        # get the triggers within the on-source
        df_within = df_aux[(target_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < target_end)]
        try:
            # values of SNR devided the difference of the trigger times (aux) and the peak time of the trigger (hoft)
            SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - target_center_GPS)
            # the representative glitch is chosen such as to maximize the SNR / time difference
            index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()]
            # mark the trigger as that target index
            df_aux.at[index_maxima, "target_rep_index"] = target_index
        except ValueError: # if no triggers of the witness channel in the on-source is not found
            pass
            #print(target_index)
    #-------------------------------------------------------------------+
    # find the SNR (aux) cut, which corresponds to the importance cut   |
    #-------------------------------------------------------------------+
    rho_cut = df_flag[df_flag["{}_flag".format(channel)] == "Y"][channel].min() # the smallest value of importance of the glitches that are flagged in the witness channel
    rho_cut_index = df_flag.loc[df_flag[channel] == rho_cut].index[0] # corresponding the index of minimum SNR of the witness channel that are flagged
    snr_cut = df_aux[df_aux["flag_rep_index"] == rho_cut_index]["snr"].values[0] # corresponding the index of minimum SNR of the witness channel that are flagged
    #-------------------------------------+
    # find the efficiency and dead time   |
    #-------------------------------------+
    # find the indicies of the target samples where the triggers in the witness channel are coincident with the glitches in the hoft and the SNR_aux is above the SNR cut defined above
    target_vetoed_index = df_aux[(df_aux["target_rep_index"] != 'None') & (df_aux["snr"] >= snr_cut)]["target_rep_index"].values
    # calculate the dead time
    deadtime = np.sum(df_target.iloc[target_vetoed_index]['end_time'] - df_target.iloc[target_vetoed_index]['start_time']) / (epoch_end - epoch_start)
    efficiency = target_vetoed_index.shape[0] / float(df_target.shape[0])
    efficiency_over_deadtime = efficiency / deadtime
    # make a new column indicatign the veto
    df_target = df_target.assign(veto=['N']*df_target.shape[0])
    df_target.loc[target_vetoed_index, 'veto'] = 'Y'
    t1 = time.time()
    print('Analysis time taken: {} s'.format(t1 - t0))
    return rho_cut, snr_cut, deadtime, efficiency, efficiency_over_deadtime, df_target


#
#
# import pandas as pd
# import numpy as np
# import matplotlib
# matplotlib.use("Agg")
# import matplotlib.pyplot as plt
#
#
#
# # list of glitches that are analyzed (subset)
# # list of all the glitches
# # list of the omicron triggers of the witness channel
# indentify = IdentifyGlitch()
# ifostate = 'observing'
# # get the state vector
# ListEpochs, trigger_dir = indentify.FindObservingTimeSegments(ifo, startT, endT, OutputHDF5_dir, state=ifostate)
# # get the omicron triggers
# indentify.GetTriggerMetadata(ListEpochs, ifo, trigger_dir, N_processes, trigger_pipeline='omicron', channel='LSC-POP_A_LF_OUT_DQ')
# # get the omicron triggers
# path_metadata = os.path.join(trigger_dir, 'TriggerMetadata.csv')
#
#
# path_metadata = '/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/O3/H1/for_ryan/1248652818_1249862418/TriggerMetadata.csv'
# # load a meta data matrix
# df = pd.read_csv(path_metadata)
#
# epoch_start, epoch_end = 1248652818, 1249862418
# Proportion_Duration_Bfr_Centr = 0.5
# df_foreground = pd.read_csv("/home/kentaro.mogushi/longlived/MachineLearningJointPisaUM/dataset/GravityspyDataSet/O3/for_ryan/gspy.csv")
# df_foreground = df_foreground[(epoch_start <= df_foreground["GPStime"]) & (df_foreground["GPStime"] <= epoch_end)] # all the glitches within the epoch
# df_flag = pd.read_csv("/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/O3/H1/for_ryan/H1/LSC-POP_A_LF_OUT_DQ_BP0_flag.csv") # selected target samples that are flagged
# df_target = df_foreground[(df_foreground["label"] == 'Extremely_Loud') & (df_foreground["snr"] > 7.5)] # all the target samples
# df_target = df_target.assign(start_time=df_target["GPStime"] - Proportion_Duration_Bfr_Centr * df_target["duration"])
# df_target = df_target.assign(end_time=df_target["GPStime"] + (1 - Proportion_Duration_Bfr_Centr) * df_target["duration"])
# # drop duplicates
# df_flag = df_flag.drop_duplicates(subset=["GPS", "duration"]).reset_index()
# df_target = df_target.drop_duplicates(subset=["GPStime", "duration"]).reset_index()
#
#
#
#
#
# # mark triggers coincident with the flagged samples
# df = df.assign(flag_index=['None']*df.shape[0])
# df = df.assign(flag_rep_index=['None']*df.shape[0])
# list_df_trigger_within_flag = []
# for flag_index in range(df_flag.shape[0]):
#     flag_start, flag_end, flag_center_GPS = df_flag.iloc[flag_index]["start_time"], df_flag.iloc[flag_index]["end_time"], df_flag.iloc[flag_index]["GPS"]
#     df.loc[(flag_start <= df["GPStime"]) & (df["GPStime"] < flag_end), 'flag_index'] = flag_index
#     df_within = df[(flag_start <= df["GPStime"]) & (df["GPStime"] < flag_end)]
#     try:
#         SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - flag_center_GPS)
#         index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()]
#         df.at[index_maxima, "flag_rep_index"] = flag_index
#     except ValueError:
#         print(flag_center_GPS, flag_end - flag_start)
#         print(flag_index)
#
#
# # mark triggers coincident with the target samples
# df = df.assign(target_index=['None']*df.shape[0])
# df = df.assign(target_rep_index=['None']*df.shape[0])
# for target_index in range(df_target.shape[0]):
#     target_start, target_end, target_center_GPS = df_target.iloc[target_index]["start_time"], df_target.iloc[target_index]["end_time"], df_target.iloc[target_index]["GPStime"]
#     df.loc[(target_start <= df["GPStime"]) & (df["GPStime"] < target_end), 'target_index'] = target_index
#     df_within = df[(target_start <= df["GPStime"]) & (df["GPStime"] < target_end)]
#     try:
#         SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - target_center_GPS)
#         index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()]
#         df.at[index_maxima, "target_rep_index"] = target_index
#     except ValueError:
#         print(target_index)
#
#
#
#
# # snr histogram
# plt.hist(np.log10(df[df["flag_index"] != 'None']["snr"]), bins=50, density=True, label='flagged', histtype='step')
# plt.hist(np.log10(df[df["target_index"] != 'None']["snr"]), bins=50, density=True, label='target', histtype='step', linestyle='--')
# plt.legend()
# plt.ylabel("Normalized count")
# plt.xlabel("log 10 (SNR) of omicron trigger in hoft")
# plt.savefig("Hist_snr.png")
# plt.close()
#
# #-------------------
# # diff GPS and SNR
# #-------------------
# list_diff_time = np.array([])
# list_snr = np.array([])
# for flag_index in df[df["flag_index"] != 'None']["flag_index"].unique():
#     trig_GPStimes = df[df["flag_index"] == flag_index]["GPStime"].values
#     SNRs = df[df["flag_index"]==flag_index]["snr"].values
#     flag_GPStime = df_flag.iloc[flag_index]["GPS"]
#     diff_GPStime = trig_GPStimes - flag_GPStime
#     list_diff_time = np.append(list_diff_time, diff_GPStime)
#     list_snr = np.append(list_snr, SNRs)
#
#
# #-------------------
# # diff GPS and SNR (maxma)
# #-------------------
# list_diff_time_rep = np.array([])
# list_snr_rep = np.array([])
# for flag_index in df[df["flag_index"] != 'None']["flag_index"].unique():
#     trig_GPStimes = df[df["flag_index"] == flag_index]["GPStime"].values
#     SNRs = df[df["flag_index"]==flag_index]["snr"].values
#     flag_GPStime = df_flag.iloc[flag_index]["GPS"]
#     if SNR.shape[0] == 0:
#         print(flag_index)
#
#     diff_GPStime = trig_GPStimes - flag_GPStime
#     SNR_over_diff_abs = SNRs / np.abs(diff_GPStime)
#     max_index = np.argmax(SNR_over_diff_abs)
#     SNR_rep = SNRs[max_index]
#     diff_GPStime_rep = diff_GPStime[max_index]
#     list_diff_time_rep = np.append(list_diff_time_rep, diff_GPStime_rep)
#     list_snr_rep = np.append(list_snr_rep, SNR_rep)
#
# plt.scatter(list_diff_time, list_snr, s=3, label='all triggers within flags')
# plt.scatter(list_diff_time_rep, list_snr_rep, s=3, label='representaitive', marker='*', alpha=0.5)
# plt.axvline(0, linestyle='--', color='black')
# plt.xlabel("GPS time (aux trigger) - GPS time (hoft)")
# plt.ylabel('SNR')
# plt.yscale("log")
# plt.legend()
# plt.savefig("dist_diff_GPS_VS_snr.png")
# plt.close()
#
# list_imp_flag_yes = np.array([])
# list_snr_flag_yes = np.array([])
# list_imp_flag_no = np.array([])
# list_snr_flag_no = np.array([])
# for flag_index in range(df_flag.shape[0]):
#     imp = df_flag.iloc[flag_index]["LSC-POP_A_LF_OUT_DQ_BP0"]
#     if df_flag.iloc[flag_index]["flag"] == 'Y':
#         snr = df[df["flag_rep_index"] == flag_index]["snr"].values
#         try:
#             snr = snr[0]
#         except IndexError:
#             snr = 0
#             print(flag_index, df_flag.iloc[flag_index]["GPS"], df_flag.iloc[flag_index]["duration"])
#         list_imp_flag_yes = np.append(list_imp_flag_yes, imp)
#         list_snr_flag_yes = np.append(list_snr_flag_yes, snr)
#     else:
#         list_imp_flag_no = np.append(list_imp_flag_no, imp)
#         snr = df[df["flag_rep_index"] == flag_index]["snr"].values
#         try:
#             snr = snr[0]
#         except IndexError:
#             snr = 0
#         list_snr_flag_no = np.append(list_snr_flag_no, snr)
#
#
#
# plt.scatter(list_imp_flag_no[list_snr_flag_no!=0], list_snr_flag_no[list_snr_flag_no!=0], s=5, label='Not flagged', marker='o', alpha=0.5, color='grey')
# plt.scatter(list_imp_flag_yes[list_snr_flag_yes!=0], list_snr_flag_yes[list_snr_flag_yes!=0], s=5, label='flagged', marker='*')
# #plt.scatter(list_imp_flag_no[list_snr_flag_no==0], list_snr_flag_no[list_snr_flag_no==0], s=10, label='Not flagged, no trig ', marker='X', alpha=0.5, color='magenta')
# #plt.scatter(list_imp_flag_yes[list_snr_flag_yes==0], list_snr_flag_yes[list_snr_flag_yes==0], s=10, label='flagged, no trig', marker='X' )
# plt.legend()
# plt.yscale("log")
# plt.title("Witness LSC-POP_A_LF_OUT_DQ_BP0")
# plt.ylabel('SNR (aux)')
# plt.xlabel("Importance")
# plt.savefig("dist_imp_VS_snr.png")
# plt.close()
#
#
#
# # veto the target samples
# rho_cut = list_imp_flag_yes[np.argmin(list_imp_flag_yes)]
# snr_cut = list_snr_flag_yes[np.argmin(list_imp_flag_yes)]
# target_vetoed_index = df[(df["target_rep_index"] != 'None') & (df["snr"] >= snr_cut)]["target_rep_index"].values
# target_not_vetoed_index = df[(df["target_rep_index"] != 'None') & (df["snr"] < snr_cut)]["target_rep_index"].values
# deadtime = np.sum(df_target.iloc[target_vetoed_index]['end_time'] - df_target.iloc[target_vetoed_index]['start_time']) / (epoch_end - epoch_start)
# efficiency = target_vetoed_index.shape[0] / float(df_target.shape[0])
# efficiency_over_deadtime =  efficiency / deadtime
#
#
# plt.scatter(df_target["GPStime"], df_target["snr"], label='all')
# plt.scatter(df_target.iloc[target_vetoed_index]["GPStime"], df_target.iloc[target_vetoed_index]["snr"], label='whole')
# plt.scatter(df_target.iloc[target_not_vetoed_index]["GPStime"], df_target.iloc[target_not_vetoed_index]["snr"], label='not vetoed (coinc)')
# plt.axhline(snr_cut, color='black', linestyle='--')
# plt.xlabel("GPS time")
# plt.title('deadtime = {:.2e}, efficiency = {:.2f}, eff /dead = {:.2f} \n , LSC-POP_A_LF_OUT_DQ_BP0'.format(deadtime, efficiency, efficiency_over_deadtime))
# plt.ylabel('SNR (hoft)')
# plt.yscale('log')
# plt.legend()
# plt.savefig("target_sample_before_selection.png")
# plt.close()
#