Source code for origli.utilities.veto_utilities

#!/usr/bin/env python
'''
file name: veto_utilities.py

this file contains the utilities to be used for finding veto channel
'''
##########################
# import module
###########################

try:
    import cv2
except:
    pass
import json
import numpy as np
import pandas as pd
import os
import matplotlib
from sklearn import mixture
from sklearn.decomposition import PCA
import h5py
import time
import random as rd
import xml.etree.ElementTree as ET # for XML file
import sys
from math import floor
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import concurrent.futures
import itertools as itrtls
from math import factorial
from scipy.stats import ttest_ind




try:
   from gwpy.timeseries import TimeSeries, TimeSeriesDict
   from gwpy.segments import DataQualityFlag
   from gwtrigfind import find_trigger_files
   from gwpy.table import EventTable
   from gwpy.table.io.pycbc import filter_empty_files as filter_pycbc_live_files
   from gwpy.time import tconvert
except ImportError:
   print('There is not gwpy module yet')


matplotlib.use("Agg")
# matplotlib.rcParams["backend"] = "TkAgg"
# Test display
# try:
#     matplotlib.rcParams["backend"] = "TkAgg"
#     m_dis = os.environ["DISPLAY"]
# except KeyError:
#     print("Display problems. Set to Agg")
#     matplotlib.use('Agg')
#
import matplotlib.pyplot as plt
plt.rc('text', usetex=False) # this is used for fixing a gwpy compatible issue
from matplotlib.colors import LogNorm
from matplotlib.ticker import MaxNLocator

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
# custom modules
from origli.utilities.utilities import Multiprocess_whitening
from origli.utilities.utilities import PlotTableAnalysis, IdentifyGlitch # for plot figures and the external trigger generator, etc
from origli.utilities.multiband_search_utilities import CreateAllChannels_rho_multband # multi-band search tools
import origli.utilities.const as Const # for multi frequency bands defined in



__author__ = "Kentaro Mogushi"
__copyright__ = "Copyright 2019-2021, Kentaro Mogushi"
__credits__ = ["Line for credits"]
__license__ = "GPL"
__version__ = "0.1.0"
__maintainer__ = "K. Mogushi"
__email__ = "kentaro.mogushi@ligo.org"
__status__ = "Developing"





#--------------------------------------------------------------


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#take time series of safe channels and calculate the absolute values of whitened FFT
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


[docs]def HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sampling_rate, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'): ''' description: calculate the importance: a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel USAGE: Count = HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sampling_rate, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None') :param whitened_fft_target: whitened fft of the on-source window :param whitened_fft_PBG: whitened fft of the preceding off-source window :param whitened_fft_FBG: whitened fft of the following off-source window :param duration: a duration of the on-source window :param sampling_rate: sampling rate of a channel :param sigma: an integer to determine the upper bound of the off-source window :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: Count: importance ''' dsetT = whitened_fft_target # a target data set dsetP = whitened_fft_PBG # a preceding BG data set dsetF = whitened_fft_FBG # a following BG data set try: # test if it is an array dsetP[0] except ValueError: # if it is None dsetP = 0 # assign an integer try: # test if it is an array dsetF[0] except ValueError: # if it is None dsetF = 0 # assign an integer spT = dsetT # target data set pixel value if type(dsetP) != int: # this is required in the condition for "Either" BG selection spP = dsetP # a preceding BG data set pixel value else: spP = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: if type(dsetF) != int: # this is required in the condition for "Either" BG selection spF = dsetF # a following BG data set pixel value else: spF = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: #======================= if type(dsetP) != int: Number_PBG_trials = spP.shape[0] / spT.shape[0] spP = spP.reshape(int(Number_PBG_trials), spT.shape[0]) if type(dsetF) != int: Number_FBG_trials = spF.shape[0] / spT.shape[0] spF = spF.reshape(int(Number_FBG_trials), spT.shape[0]) #======================= #++++++ for the case where there is no data measured at a channel +++++++++++++++++++ #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++ #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: # if target and BG have data measured pT = spT #** 2 # power of target if type(dsetP) != int: pP = spP #** 2 # power of a preceding BG if type(dsetF) != int: pF = spF #** 2 # power of a following BG # low, high, or band pass filter if we know which frequency band is important if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: pT = np.array([0]) pP = np.array([0]) pF = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq*duration) if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector pT = pT[LowerCutOffElement:] if type(dsetP) != int: pP = pP[:, LowerCutOffElement:] if type(dsetF) != int: pF = pF[:, LowerCutOffElement:] else: # if LowerCutOffElement is greater than the length of the vector pT = np.array([0]) if type(dsetP) != int: pP = np.array([0]) if type(dsetF) != int: pF = np.array([0]) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter # even if UpperCutOffElement is greater than the length of the vector is works fine UpperCutOffElement = int(UpperCutOffFreq * duration) pT = pT[:UpperCutOffElement] if type(dsetP) != int: pP = pP[:, :UpperCutOffElement] if type(dsetF) != int: pF = pF[:, :UpperCutOffElement] elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: pT = np.array([0]) pP = np.array([0]) pF = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq * duration) UpperCutOffElement = int(UpperCutOffFreq * duration) if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector pT = pT[LowerCutOffElement:UpperCutOffElement] if type(dsetP) != int: pP = pP[:, LowerCutOffElement:UpperCutOffElement] if type(dsetF) != int: pF = pF[:, LowerCutOffElement:UpperCutOffElement] else: # if LowCutOffElement is greater than the length of the vector pT = np.array([0]) if type(dsetP) != int: pP = np.array([0]) if type(dsetF) != int: pF = np.array([0]) else: pass if type(dsetP) != int and type(dsetF) != int: pB = np.concatenate((pP, pF), axis=0) elif type(dsetP) != int and type(dsetF) == int: pB = pP elif type(dsetP) == int and type(dsetF) != int: pB = pF GausCeiling = pB.mean() + sigma * pB.std(ddof=1) # gaussian noise ceiling try: Count = pT[pT > GausCeiling].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling except ZeroDivisionError: # in the situation where the number of the total elments is zero, I saw GausCeiling is nan. So this channel has mulfunction. I manually set Count = 0 Count = 0 # if the data is flat over the stretch of the segment, the upper bound threshold of BG called "GausCeiling" becomes zero (BG has to flactuate so that "GausCeiling" equal to zero is fake) # if so that importance called "Count" becomes 1. To prevent this fake importance, I set Count = 0 manually if GausCeiling == 0.: Count = 0 else: Count = 0 # manually put zero because this channel has no contribution at this glitch ID return Count
[docs]def CreateRho(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, sigma, LowerCutOffFreq, UpperCutOffFreq): ''' discription: 1. calculate the whitened FFT of the on- and off-source window for a single channel 2. compute the value of importance for a single channel :param full_timeseries: the full time series in gwpy object including on- and off source windows :param target_timeseries_start: the start time of the on-source window :param target_timeseries_end: the end time of the on-source window :param pre_background_start: the start time of the preceding off-source window :param pre_background_end: the end time of the preceding off-source window :param fol_background_start: the start time of the following off-source window :param fol_background_end: the end time of the following off-source window :param sigma: an interger to calculate the value of importance :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: Count: the importance: a fraction of frequency bins in a frequency range above an upper bound of the off-source window for a single channel ''' # calculate the whitened FFT of the on- and off-source windows whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, duration = Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end) # calculate the importance Count = HierarchyChannelAboveThreshold_single_channel(whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, duration, sample_rate, sigma, LowerCutOffFreq, UpperCutOffFreq) return Count
[docs]def CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq): ''' description: 1. use a single glitch time 2. query timeseries of all the channels around a glitch 3. condition (whitening and compare the on- and off-source window) 4. quantify all the channels (compute values of importance of all the channels) USAGE: List_Count, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) :param Listsegment: a list of segment parameters :param IFO :param channels: a list of safe channels :param number_process: a number of processes in parallel :param PlusHOFT: whether to get data of hoft, {'True' or 'False'} :param sigma: an integer to be used for calculating values of importance :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency ''' channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs] IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID = Listsegment t0 = time.time() # extracting the target data and the proceeding and following backgrounds try: # deal with time series in bad state if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict = TimeSeriesDict.find(channels, pre_background_start, target_timeseries_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict = TimeSeriesDict.find(channels, target_timeseries_start, fol_background_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process) else: pass if PlusHOFT == 'True': # get the data of hoft if pre_background_start != 0 and pre_background_end != 0 and fol_background_start != 0 and fol_background_end != 0: # if there is an available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start != 0 and pre_background_end != 0 and fol_background_start == 0 and fol_background_end == 0: # if there is an available proceeding background and if there is NO available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], pre_background_start, target_timeseries_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) elif pre_background_start == 0 and pre_background_end == 0 and fol_background_start != 0 and fol_background_end != 0: # if there is NO available proceeding background and if there is an available following background full_timeseriesDict_hoft = TimeSeriesDict.find(['{}:GDS-CALIB_STRAIN'.format(IFO)], target_timeseries_start, fol_background_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process) else: pass full_timeseriesDict.append(full_timeseriesDict_hoft) # append del full_timeseriesDict_hoft # delete the hoft object channels = np.append(channels, '{}:GDS-CALIB_STRAIN'.format(IFO)) except IndexError: # IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ASC-X_TR_A_NSUM_OUT_DQ offset: 932092099) return 0, 0, 0, 0, 0, 0, 0 except RuntimeError: return 0, 0, 0, 0, 0, 0, 0 except IOError: # error when getting time series print('IOError') return 0, 0, 0, 0, 0, 0, 0 except ValueError: # ValueError: Failed to read /hdfs/frames/O3/raw/H1/H-H1_R-12384/H-H1_R-1238489600-64.gwf: Unable to map from the stream class id (0) to internal class id at position: 18446744073709551601 [streamref: class: 0 instance: 0 length: 0 ] print('ValueError') return 0, 0, 0, 0, 0, 0, 0 try: List_Count = np.array([]) # a list of rho # whitened target and BG segments with multi processes with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor: for Count in executor.map(CreateRho, full_timeseriesDict.values(), itrtls.repeat(target_timeseries_start), itrtls.repeat(target_timeseries_end), itrtls.repeat(pre_background_start), itrtls.repeat(pre_background_end), itrtls.repeat(fol_background_start), itrtls.repeat(fol_background_end), itrtls.repeat(sigma), itrtls.repeat(LowerCutOffFreq), itrtls.repeat(UpperCutOffFreq) ): List_Count = np.append(List_Count, Count) # fill except ValueError: print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0 except IOError: # IOError: Driver write request failed print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied)) return 0, 0, 0, 0, 0, 0, 0 t1 = time.time() print('Time taken: {} s'.format(t1 - t0)) return List_Count, re_sfchs, gpstime, duration, SNR, confi, ID # pass the parameters
# # def WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle='True', PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None'): # ''' # description: # 1. use a list of glitches # 2. analyze glitches of the number of 'first chunk' with all the channels of 're_sfchs_init' # 3. perform one-sided binomial test and Welch one-sided t-test # 4. reject the channel that do NOT pass the both tests, i.e., can not reject a hypothesis that a channel is consistent with null samples # 5. calculate the error ratio of the t-value of the top ranking channel to the previous t-value # 6. analyze a next glitch using the channels that pass the both tests # 7. add the values of importance to the passed analyzed samples # 8. repeat (3)-(7) # 9. terminate the process when the error ratio reaches the tolerance # :param Listsegments: a list of glitches # :param IFO: ifo # :param re_sfchs_init: all thes safe channels to be used at the beginning # :param sigma: an integer to determine the upper bound of the off-source window # :param number_process: number processes of a machine # :param first_chunk: the number of samples to be used for the first chunk, where all the channels are to be used # :param tolerate: tolerance number to stop the analysis # :param confidence_level: confidence level for one-sided binomial test and Welch one-sided t-test # :param df_null: null samples in pandas frame, which is expected to already created # :param shuffle: boolen, whether shuffle the list of glitches # :param PlusHOFT: boolen, whether analysis hoft # :param LowerCutOffFreq: a lower cutoff frequency # :param UpperCutOffFreq: an upper cutoff frequency # :return: # re_sfchs: a list of channels that will have passed the tests untill the tolerance # MatCount: a matrix of importance of the channels that passed the test # list_Causal_passed_final: a list of witness ratio statistics of the channels that passed the tests # list_t_values_passed_final: a list of t-values of the channels that passed the tests # USAGE: re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final = WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle=True, PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None') # ''' # t0 = time.time() # re_sfchs = re_sfchs_init # # get a class object creating plots and table # plotter = PlotTableAnalysis() # index = 0 # index of a sample to be analyzed, first it is set to zero # MatCount = np.array([]) # ListGPS = np.array([]) # ListDuration = np.array([]) # ListSNR = np.array([]) # ListConfi = np.array([]) # ListID = np.array([]) # if shuffle == 'True': # rd.shuffle(Listsegments) # else: # pass # for Listsegment in Listsegments: # ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) # if type(ListCount) == int: # if some error occurs, List_rho is 0, go to the next iteration # continue # else: # if the rho is calculated correctly # MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount # if index == 0: # MatCount = MatCount.reshape(1, MatCount.shape[-1]) # ListGPS = np.append(ListGPS, gpstime) # ListDuration = np.append(ListDuration, duration) # ListSNR = np.append(ListSNR, SNR) # ListConfi = np.append(ListConfi, confi) # ListID = np.append(ListID, ID) # # concatenate a list of GPS times and importance of all the channels # MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1) # # make a list of column names # columnName = np.array(['GPS', 'duration', 'SNR', 'confidence']) # columnName = np.append(columnName, re_sfchs) # # convert it to a pandas frame # df_target = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) # if index >= first_chunk: # the sample size reaches more or equal to first_chunk # #------- perform one-sided binomial test and Welch one-sided t-test ------- # list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = plotter.find_meaing_ful_confidence(df_target, df_null, confidence_level, channels=re_sfchs) # _, list_t_values_passed, list_t_values_failed, list_Test, confidence_level_used = plotter.perform_Welch_test(df_target, df_null, confidence_level, channels=re_sfchs) # # ----- find the channel name that has the largest t-value # max_t_value = list_t_values_passed.max() # largest t-value # channel_max_t_value = re_sfchs[list_t_values_passed == max_t_value][0] # #------- keep channels which pass both binomial and Welch t-test -------- # # find the indices of channels that pass binomial test # index_channels_pass_bio = np.where(list_Causal_passed != 0)[0] # # find the indices of channels that pass Welch t-test # index_channels_pass_welch = np.where(list_t_values_passed != 0)[0] # # find the common indices of channels that pass the both tests # index_pass_both = np.intersect1d(index_channels_pass_bio, index_channels_pass_welch) # # keep the channels that pass the both tests # re_sfchs = re_sfchs[index_pass_both] # # keep values of importance of channels that pass both tests in MatCount # MatCount = MatCount[:, index_pass_both] # # keep the t-values of the channel that pass Welch t-test # df_t_value = pd.DataFrame(list_t_values_passed.reshape(1, list_t_values_passed.shape[-1])[:, index_pass_both], columns=re_sfchs) # print('=' * 50) # print('index: {}'.format(index)) # print('Number of channels that pass the tests with confidence level of {}%: {}'.format(confidence_level*100, index_pass_both.shape[0])) # print('Largest t-value {} is given by {}'.format(max_t_value, channel_max_t_value)) # if index > first_chunk: # ratio_t_test = ((df_t_value[channel_max_t_value] - df_t_value_previous[channel_max_t_value]) / df_t_value_previous[channel_max_t_value]).values[0] # print('Ratio error of the current t-value to the previous of {}: {}'.format(channel_max_t_value, ratio_t_test)) # if abs(ratio_t_test) < tolerance: # print('Terminate the analysis as the ratio error reached the tolerance of {}'.format(tolerance)) # break # break the loop # else: # pass # df_t_value_previous = df_t_value # index += 1 # list_Causal_passed_final = list_Causal_passed[index_pass_both] # list_t_values_passed_final = list_t_values_passed[index_pass_both] # t1 = time.time() # if abs(ratio_t_test) >= tolerance: # if not the error ratio does not reaches the tolerance # print('Terminate the analysis as all the samples have been analyzed') # print('The current ratio error {} does not pass the tolerance of {}'.format(ratio_t_test, tolerance)) # else: # pass # print('Total analysis time taken for finding witness channels: {} s'.format(t1 - t0)) # return re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final, df_target # #
[docs]def WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle='True', PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None', freq_bands=Const.freq_bands): ''' description: 1. use a list of glitches 2. analyze glitches of the number of 'first chunk' with all the channels of 're_sfchs_init' 3. perform one-sided binomial test and Welch one-sided t-test 4. reject the channel that do NOT pass the both tests, i.e., can not reject a hypothesis that a channel is consistent with null samples 5. calculate the error ratio of the t-value of the top ranking channel to the previous t-value 6. analyze a next glitch using the channels that pass the both tests 7. add the values of importance to the passed analyzed samples 8. repeat (3)-(7) 9. terminate the process when the error ratio reaches the tolerance USAGE: re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final = WitnessFinder(Listsegments, IFO, re_sfchs_init, sigma, number_process, first_chunk, tolerance, confidence_level, df_null, shuffle=True, PlusHOFT='False', LowerCutOffFreq='None', UpperCutOffFreq='None') :param Listsegments: a list of glitches :param IFO: ifo :param re_sfchs_init: all thes safe channels to be used at the beginning :param sigma: an integer to determine the upper bound of the off-source window :param number_process: number processes of a machine :param first_chunk: the number of samples to be used for the first chunk, where all the channels are to be used :param tolerate: tolerance number to stop the USAGE: List_Count, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) analysis :param confidence_level: confidence level for one-sided binomial test and Welch one-sided t-test :param df_null: null samples in pandas frame, which is expected to already created :param shuffle: boolen, whether shuffle the list of glitches :param PlusHOFT: boolen, whether analysis hoft :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: re_sfchs: a list of channels that will have passed the tests untill the tolerance MatCount: a matrix of importance of the channels that passed the test list_Causal_passed_final: a list of witness ratio statistics of the channels that passed the tests list_t_values_passed_final: a list of t-values of the channels that passed the tests ''' t0 = time.time() re_sfchs = re_sfchs_init # get a class object creating plots and table plotter = PlotTableAnalysis() index = 0 # index of a sample to be analyzed, first it is set to zero MatCount = np.array([]) ListGPS = np.array([]) ListDuration = np.array([]) ListSNR = np.array([]) ListConfi = np.array([]) ListID = np.array([]) if shuffle == 'True': rd.shuffle(Listsegments) else: pass for Listsegment in Listsegments: if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen re_sfchs_previous = re_sfchs # keep the channels for the later if re_sfchs[0][-3:-1] == "BP": # if the channel names are constructed with frequency-bands # get rid of _BP# and the end of channel names. re_sfchs_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in re_sfchs]) # get the channel name without duplication re_sfchs_queried = np.unique(re_sfchs_queried) # query data and calculate values of importance and etc IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, re_sfchs_queried, number_process, PlusHOFT, sigma) else: # at the first time of querying data (at the first iteration, channel has no "BP#" at the end of channel name) # query data and calculate values of importance and etc IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma) # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))]) re_sfchs = re_sfchs.flatten(order='F') # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0 ListCount = Mat_Count_in_multibands.flatten(order='F') # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0) if re_sfchs_previous[0][-3:-1] == "BP": # if the channel names are constructed with frequency-bands # keep the channels in the bands that pass the test in the previous round keep_index = np.array([np.where(re_sfchs == re_sfch_previous)[0][0] for re_sfch_previous in re_sfchs_previous]) re_sfchs = re_sfchs[keep_index] ListCount = ListCount[keep_index] else: ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, re_sfchs, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) if type(ListCount) == int: # if some error occurs, List_rho is 0, go to the next iteration continue else: # if the rho is calculated correctly MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount if index == 0: MatCount = MatCount.reshape(1, MatCount.shape[-1]) ListGPS = np.append(ListGPS, gpstime) ListDuration = np.append(ListDuration, duration) ListSNR = np.append(ListSNR, SNR) ListConfi = np.append(ListConfi, confi) ListID = np.append(ListID, ID) # concatenate a list of GPS times and importance of all the channels MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1) # make a list of column names columnName = np.array(['GPS', 'duration', 'SNR', 'confidence']) columnName = np.append(columnName, re_sfchs) # convert it to a pandas frame df_target = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) if index >= first_chunk: # the sample size reaches more or equal to first_chunk #------- perform one-sided binomial test and Welch one-sided t-test ------- list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = plotter.find_meaing_ful_confidence(df_target, df_null, confidence_level, channels=re_sfchs) _, list_t_values_passed, list_t_values_failed, list_Test, confidence_level_used = plotter.perform_Welch_test(df_target, df_null, confidence_level, channels=re_sfchs) # ----- find the channel name that has the largest t-value max_t_value = list_t_values_passed.max() # largest t-value channel_max_t_value = re_sfchs[list_t_values_passed == max_t_value][0] #------- keep channels which pass both binomial and Welch t-test -------- # find the indices of channels that pass binomial test index_channels_pass_bio = np.where(list_Causal_passed != 0)[0] # find the indices of channels that pass Welch t-test index_channels_pass_welch = np.where(list_t_values_passed != 0)[0] # find the common indices of channels that pass the both tests index_pass_both = np.intersect1d(index_channels_pass_bio, index_channels_pass_welch) # keep the channels that pass the both tests re_sfchs = re_sfchs[index_pass_both] # keep values of importance of channels that pass both tests in MatCount MatCount = MatCount[:, index_pass_both] # keep the t-values of the channel that pass Welch t-test df_t_value = pd.DataFrame(list_t_values_passed.reshape(1, list_t_values_passed.shape[-1])[:, index_pass_both], columns=re_sfchs) print('=' * 50) print('index: {}'.format(index)) print('Number of channels that pass the tests with confidence level of {}%: {}'.format(confidence_level*100, index_pass_both.shape[0])) print('Largest t-value {} is given by {}'.format(max_t_value, channel_max_t_value)) if index > first_chunk: ratio_t_test = ((df_t_value[channel_max_t_value] - df_t_value_previous[channel_max_t_value]) / df_t_value_previous[channel_max_t_value]).values[0] print('Ratio error of the current t-value to the previous of {}: {}'.format(channel_max_t_value, ratio_t_test)) if abs(ratio_t_test) < tolerance: print('Terminate the analysis as the ratio error reached the tolerance of {}'.format(tolerance)) break # break the loop else: pass df_t_value_previous = df_t_value index += 1 list_Causal_passed_final = list_Causal_passed[index_pass_both] list_t_values_passed_final = list_t_values_passed[index_pass_both] t1 = time.time() if abs(ratio_t_test) >= tolerance: # if not the error ratio does not reaches the tolerance print('Terminate the analysis as all the samples have been analyzed') print('The current ratio error {} does not pass the tolerance of {}'.format(ratio_t_test, tolerance)) else: pass print('Total analysis time taken for finding witness channels: {} s'.format(t1 - t0)) return re_sfchs, MatCount, list_Causal_passed_final, list_t_values_passed_final, df_target
[docs]def BackgroundCut(df_null, channel, background_upper_cut): ''' description: calculate the upper cut of channels using the FAP distribution USAGE: cut = BackgroundCut(df_null, channel, background_upper_cut) :param df_null: null samples in pandas frame :param channel: a list of channels :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g., 1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426 :return: cut: an uuper cut of the null sample of those channels ''' count, edges = np.histogram(df_null[channel], bins=500, range=(0, 1)) delta_edge = edges[1] - edges[0] edg_cent = edges[:-1] + delta_edge/2. totalN = float(df_null.shape[0]) frac = count / totalN # fraction of samples FAP = np.cumsum(frac[::-1])[::-1] # cumulative number of fraction of samples in ascending order from the right #percentile = FAP[FAP < 1-background_upper_cut][0] percentile = FAP[FAP > 1 - background_upper_cut][-1] index = np.where(FAP == percentile)[0][0] cut = edg_cent[index] return cut
[docs]def FlagFinder(Epoch_lt, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq, freq_bands=Const.freq_bands): ''' description: 1. select high ranking witness channels 2. determine the upper cut of the null samples for those high ranking witness channels 3. analyze all the glitches using the selected witness channels 4. make a flag when those channels give importance above the upper cut of the null (flags are made if ALL the chosen witness have value of importance above the upper cut of the null samples) 5. calculate efficiency and deadtime USAGE: efficiency, deadtime_frac, df = FlagFinder(Epoch_lt, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq) :param Epoch_lt: a list of an epoch :param Listsegments: a list of glitches :param IFO: ifo :param channels: a list of channels, which are expected to be witness channels :param list_statistics: a list of ranking statistics, either witness ratio statistics or t-value :param num_high_rank_channels_to_be_used: number of high ranking channels to be used for making flag :param df_null: null samples in pandas frame :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g., 1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426 :param number_process: number of processors :param sigma: an integer to determine the upper bound of the off-source window :param PlusHOFT: boolen, whether analyze hoft :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: efficiency: a ratio of glitches that are flagged to total glitches analyzed without an issue of no data available deadtime_frac: a ratio of total on-source windows to the total analysis time df: a matrix of GPS, duration, SNR, confidence level of classification of glitches, flag and importance in pandas frame ''' t0 = time.time() nummber_channels = channels.shape[0] if num_high_rank_channels_to_be_used > nummber_channels: num_high_rank_channels_to_be_used = nummber_channels else: pass #----- find the high ranking channels # sort the channels based the statistics print(channels) Ranked_channels = [channels[i] for i in np.argsort(list_statistics)[::-1]] # sort the statistics Ranked_statistics = [list_statistics[i] for i in np.argsort(list_statistics)[::-1]] # take the high ranking channels Ranked_channels_to_be_used = Ranked_channels[:num_high_rank_channels_to_be_used] # determine the lower cut of the high ranking channels print('Witness channel(s) used for flag') print('{}% confidence level for the upper cut of the background distribution'.format(background_upper_cut*100)) cuts = np.array([]) for Ranked_channel in Ranked_channels_to_be_used: cut = BackgroundCut(df_null, Ranked_channel, background_upper_cut) cuts = np.append(cuts, cut) print('{} with background cut of {}'.format(Ranked_channel, cut)) #---- analyze glitches ------------ index = 1 MatCount = np.array([]) ListGPS = np.array([]) ListDuration = np.array([]) ListSNR = np.array([]) ListConfi = np.array([]) ListID = np.array([]) ListFlag = np.array([]) deadtime = 0 veto = 0 if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen # get rid of _BP# and the end of channel names. Ranked_channels_to_be_used_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in Ranked_channels_to_be_used]) # get the channel name without duplication Ranked_channels_to_be_used_queried = np.unique(Ranked_channels_to_be_used_queried) for Listsegment in Listsegments: if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, Ranked_channels_to_be_used_queried, number_process, PlusHOFT, sigma) # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))]) re_sfchs = re_sfchs.flatten(order='F') # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0 ListCount = Mat_Count_in_multibands.flatten(order='F') # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0) # keep the channels in the bands keep_index = np.array([np.where(re_sfchs == Ranked_channel_to_be_used)[0][0] for Ranked_channel_to_be_used in Ranked_channels_to_be_used]) ListCount = ListCount[keep_index] else: ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, Ranked_channels_to_be_used, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) if type(ListCount) == int: # if some error occurs, List_rho is None, go to the next iteration continue else: # if the rho is calculated correctly MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount #if MatCount.shape[-1] == 1: if index == 1: MatCount = MatCount.reshape(1, MatCount.shape[-1]) ListGPS = np.append(ListGPS, gpstime) ListDuration = np.append(ListDuration, duration) ListSNR = np.append(ListSNR, SNR) ListConfi = np.append(ListConfi, confi) ListID = np.append(ListID, ID) index += 1 if (ListCount > cuts).all() == True: # if all the high ranking channel have values of importance above cuts print('Index {}: A glitch at GPS = {} is flagged'.format(index, gpstime)) veto += 1 deadtime += duration ListFlag = np.append(ListFlag, 'Y') else: print('Index {}: A glitch at GPS = {} is NOT flagged'.format(index, gpstime)) ListFlag = np.append(ListFlag, 'N') efficiency = veto / float(index) deadtime_frac = deadtime / float(Epoch_lt[-1][-1] - Epoch_lt[0][0]) # concatenate a list of GPS times and importance of all the channels MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), ListFlag.reshape(ListFlag.shape[0], 1), MatCount), axis=1) # make a list of column names columnName = np.array(['GPS', 'duration', 'SNR', 'confidence', 'flag']) columnName = np.append(columnName, Ranked_channels_to_be_used) # convert it to a pandas frame df = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) t1 = time.time() print('Analysis time taken: {} s'.format(t1 - t0)) return efficiency, deadtime_frac, df
[docs]def FlagFinder_all_witnesses(Proportion_Duration_Bfr_Centr, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq, freq_bands=Const.freq_bands): ''' description: 1. select high ranking witness channels 2. determine the upper cut of the null samples for those high ranking witness channels 3. analyze all the glitches using the selected witness channels 4. make flags when those channels give importance above the upper cut of the null (flags are make for individual channels) USAGE: df_flag = FlagFinder(Proportion_Duration_Bfr_Centr, Listsegments, IFO, channels, list_statistics, num_high_rank_channels_to_be_used, df_null, background_upper_cut, number_process, sigma, PlusHOFT, LowerCutOffFreq, UpperCutOffFreq) :param Proportion_Duration_Bfr_Centr: a fraction of the on-source window before the peak GPS time :param Listsegments: a list of glitches :param IFO: ifo :param channels: a list of channels, which are expected to be witness channels :param list_statistics: a list of ranking statistics, either witness ratio statistics or t-value :param num_high_rank_channels_to_be_used: number of high ranking channels to be used for making flag :param df_null: null samples in pandas frame :param background_upper_cut: confidence level of the uppercut of null samples of witness channel(s), e.g., 1sigma = 0.68268, 2sigma = 0.95449, 3sigma = 0.997300204, 4sigma = 0.99993666, and 5simga = 99.9999426 :param number_process: number of processors :param sigma: an integer to determine the upper bound of the off-source window :param PlusHOFT: boolen, whether analyze hoft :param LowerCutOffFreq: a lower cutoff frequency :param UpperCutOffFreq: an upper cutoff frequency :return: df_flag: a matrix of GPS, duration, SNR, confidence level of classification of glitches, importance of the witness channels, the flags of the witness channels in pandas frame ''' t0 = time.time() nummber_channels = channels.shape[0] if num_high_rank_channels_to_be_used > nummber_channels: num_high_rank_channels_to_be_used = nummber_channels else: pass #----- find the high ranking channels # sort the channels based the statistics Ranked_channels = [channels[i] for i in np.argsort(list_statistics)[::-1]] print(Ranked_channels) # sort the statistics Ranked_statistics = [list_statistics[i] for i in np.argsort(list_statistics)[::-1]] # take the high ranking channels Ranked_channels_to_be_used = Ranked_channels[:num_high_rank_channels_to_be_used] # determine the lower cut of the high ranking channels print('Witness channel(s) used for flag') print('{}% confidence level for the upper cut of the background distribution'.format(background_upper_cut*100)) cuts = np.array([]) for Ranked_channel in Ranked_channels_to_be_used: cut = BackgroundCut(df_null, Ranked_channel, background_upper_cut) cuts = np.append(cuts, cut) print('{} with background cut of {}'.format(Ranked_channel, cut)) #---- analyze glitches ------------ index = 1 MatCount = np.array([]) ListGPS = np.array([]) ListDuration = np.array([]) ListSNR = np.array([]) ListConfi = np.array([]) ListID = np.array([]) #ListFlag = np.array([]) MatFlag = {} for channel in Ranked_channels_to_be_used: MatFlag['{}_flag'.format(channel)] = [] if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen # get rid of _BP# and the end of channel names. Ranked_channels_to_be_used_queried = np.array([ch.rsplit("_BP", 1)[0] for ch in Ranked_channels_to_be_used]) # get the channel name without duplication Ranked_channels_to_be_used_queried = np.unique(Ranked_channels_to_be_used_queried) for Listsegment in Listsegments: if LowerCutOffFreq == "Multiband" and UpperCutOffFreq == "Multiband": # multi-frequency band search is chosen IndexSatisfied, Mat_Count_in_multibands, list_sample_rates, re_sfchs, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho_multband(Listsegment, IFO, Ranked_channels_to_be_used_queried, number_process, PlusHOFT, sigma) # make a matrix of channels with band information added to the end of the channel name, where frequency bands are in rows from top to bottom and channels in columes re_sfchs = np.array([np.core.defchararray.add(re_sfchs, '_BP{}'.format(int(i))) for i in range(len(freq_bands))]) re_sfchs = re_sfchs.flatten(order='F') # list of revised channel names where ChannlA_BP0, ChannlA_BP1, ..., ChannlB_BP0 ListCount = Mat_Count_in_multibands.flatten(order='F') # list of rhos, in the structure of (rho of channel A in band 0, rho of channel A in band 1, ..., rho of channel B in band 0) # keep the channels in the bands keep_index = np.array([np.where(re_sfchs == Ranked_channel_to_be_used)[0][0] for Ranked_channel_to_be_used in Ranked_channels_to_be_used]) ListCount = ListCount[keep_index] else: ListCount, channels, gpstime, duration, SNR, confi, ID = CreateAllChannels_rho(Listsegment, IFO, Ranked_channels_to_be_used, number_process, PlusHOFT, sigma, LowerCutOffFreq, UpperCutOffFreq) if type(ListCount) == int: # if some error occurs, List_rho is None, go to the next iteration continue else: # if the rho is calculated correctly MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount #if MatCount.shape[-1] == 1: if index == 1: MatCount = MatCount.reshape(1, MatCount.shape[-1]) ListGPS = np.append(ListGPS, gpstime) ListDuration = np.append(ListDuration, duration) ListSNR = np.append(ListSNR, SNR) ListConfi = np.append(ListConfi, confi) ListID = np.append(ListID, ID) index += 1 for i in range(len(ListCount)): if ListCount[i] > cuts[i]: MatFlag['{}_flag'.format(Ranked_channels_to_be_used[i])].append('Y') else: MatFlag['{}_flag'.format(Ranked_channels_to_be_used[i])].append('N') # concatenate a list of GPS times and importance of all the channels MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), ListSNR.reshape(ListSNR.shape[0], 1), ListConfi.reshape(ListConfi.shape[0], 1), MatCount), axis=1) # make a list of column names columnName = np.array(['GPStime', 'duration', 'SNR', 'confidence']) columnName = np.append(columnName, Ranked_channels_to_be_used) # convert it to a pandas frame df = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) df_flag = pd.DataFrame(MatFlag) df_flag = pd.concat([df, df_flag], axis=1) df_flag = df_flag.assign(start_time=(df_flag["GPStime"].astype("float") - df_flag.duration.astype("float") * Proportion_Duration_Bfr_Centr).round(3)) df_flag = df_flag.assign(end_time=(df_flag["GPStime"].astype("float") + df_flag.duration.astype("float") * (1 - Proportion_Duration_Bfr_Centr)).round(3)) t1 = time.time() print('Analysis time taken: {} s'.format(t1 - t0)) return df_flag
[docs]def make_veto_omicron_in_aux(epoch_start, epoch_end, IFO, channel, df_foreground, glitch_type, Proportion_Duration_Bfr_Centr, SNR_thresh, df_flag, OutputHDF5_dir, ifostate, N_processes): ''' description: 1. query omicron triggers (aux) of a witness channel 2. find the aux omicron triggers which are coincident with the glitches that are analyzed 3. find the aux SNR cut which corresponds to the importance cut of this witness channel 4. find the aux omicron triggers which are coincident with all the glitches with label being studied 5. veto glitches when the coincident aux triggers have SNR above the aux SNR cut (given in the step 3) USAGE: rho_cut, snr_cut, deadtime, efficiency, efficiency_over_deadtime, df_target = make_veto_omicron_in_aux(epoch_start, epoch_end, IFO, channel, df_foreground, glitch_type, Proportion_Duration_Bfr_Centr, SNR_thresh, df_flag, OutputHDF5_dir, ifostate, N_processes) :param epoch_start: start time of the analysis period :param epoch_end: end time of the analysis period :param IFO: ifo {'H1' or 'L1'} :param channel: a witness channel name, it could be a channel in a particular frequency band :param df_foreground: a pandas data frame of all the glitches in the strain channel fed into pychChoo :param glitch_type: a glitch type that is focused on :param Proportion_Duration_Bfr_Centr: a fraction the on-source window before the peak GPS time of a glitch :param SNR_thresh: a lower SNR threshold to select glitches that are studied :param df_flag: a pandas data frame of flagged (including 'Y' and 'N') of the witness channels for the glitches that have been analyzed with FlagFinder_all_witnesses() :param OutputHDF5_dir: a output directory where the omicron trigger of a witness channel :param ifostate: state of an ifo :param N_processes: number of cores :return: rho_cut: lower cut of importance of a witness channel snr_cut: corresponding SNR cut of this witness channel deadtime: a fraction of the time that are vetoed during the analysis time efficiency: a fraction of glitches that are vetoed efficiency_over_deadtime: ratio of efficiency over deadtime df_target: a pandas data frame of this witness, where the glitches that are vetoed are marked as 'Y' ''' t0 = time.time() #---------------------------------------------------+ # get the omicron trigger of the witness channel | #---------------------------------------------------+ indentify = IdentifyGlitch() if channel[-3:-1] == "BP": # if multi-band search is selected channel_original = channel.rsplit("_BP", 1)[0] else: channel_original = channel # get the state vector ListEpochs, trigger_dir = indentify.FindObservingTimeSegments(IFO, epoch_start, epoch_end, OutputHDF5_dir, state=ifostate) # get the omicron triggers of the witness channel indentify.GetTriggerMetadata(ListEpochs, IFO, trigger_dir, N_processes, trigger_pipeline='omicron', channel=channel_original) # get the omicron triggers path_metadata = os.path.join(trigger_dir, 'TriggerMetadata.csv') # load a meta data matrix df_aux = pd.read_csv(path_metadata) #-------------------------------------------------------------------+ # all the glitches in h(t) within the epoch and the label chosen | #-------------------------------------------------------------------+ # all the glitches during the epoch df_foreground = df_foreground[(epoch_start <= df_foreground["GPStime"]) & (df_foreground["GPStime"] <= epoch_end)] # all the glitches within the epoch # all the glitches during the epoch and the label is the one chosen df_target = df_foreground[(df_foreground["label"] == glitch_type) & (df_foreground["snr"] > SNR_thresh)] # all the target samples # assgin the start and end time of the on-source windows df_target = df_target.assign(start_time=df_target["GPStime"] - Proportion_Duration_Bfr_Centr * df_target["duration"]) df_target = df_target.assign(end_time=df_target["GPStime"] + (1 - Proportion_Duration_Bfr_Centr) * df_target["duration"]) # drop duplicates df_flag = df_flag.drop_duplicates(subset=["GPStime", "duration"]).reset_index() df_target = df_target.drop_duplicates(subset=["GPStime", "duration"]).reset_index() #---------------------------------------------------+ # mark triggers coincident with the flagged samples | #---------------------------------------------------+ # mark triggers coincident with the flagged samples df_aux = df_aux.assign(flag_index=['None'] * df_aux.shape[0]) df_aux = df_aux.assign(flag_rep_index=['None'] * df_aux.shape[0]) for flag_index in range(df_flag.shape[0]): # iterate the all the flagged samples (subset samples ) # start, end, and the peak GPS timeos of the flagged samples flag_start, flag_end, flag_center_GPS = df_flag.iloc[flag_index]["start_time"], df_flag.iloc[flag_index]["end_time"], df_flag.iloc[flag_index]["GPStime"] # mark the omicron trigggers of the witness channel that are within the flagged segment df_aux.loc[(flag_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < flag_end), 'flag_index'] = flag_index # get the triggers within the on-source df_within = df_aux[(flag_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < flag_end)] try: # values of SNR devided the difference of the trigger times (aux) and the peak time of the trigger (hoft) SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - flag_center_GPS) # the representative glitch is chosen such as to maximize the SNR / time difference index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()] # mark the trigger as that flag index df_aux.at[index_maxima, "flag_rep_index"] = flag_index except ValueError: # if no triggers of the witness channel in the on-source is not found pass #print(flag_center_GPS, flag_end - flag_start) #print(flag_index) #----------------------------------------------------+ # mark triggers coincident with the target samples | #----------------------------------------------------+ df_aux = df_aux.assign(target_index=['None'] * df_aux.shape[0]) df_aux = df_aux.assign(target_rep_index=['None'] * df_aux.shape[0]) for target_index in range(df_target.shape[0]): # iterate the target glitch samples # the start, end and the peak GPS time of the glitches (hoft) target_start, target_end, target_center_GPS = df_target.iloc[target_index]["start_time"], df_target.iloc[target_index]["end_time"], df_target.iloc[target_index]["GPStime"] # mark the omicron trigggers of the witness channel that are within the on-source window df_aux.loc[(target_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < target_end), 'target_index'] = target_index # get the triggers within the on-source df_within = df_aux[(target_start <= df_aux["GPStime"]) & (df_aux["GPStime"] < target_end)] try: # values of SNR devided the difference of the trigger times (aux) and the peak time of the trigger (hoft) SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - target_center_GPS) # the representative glitch is chosen such as to maximize the SNR / time difference index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()] # mark the trigger as that target index df_aux.at[index_maxima, "target_rep_index"] = target_index except ValueError: # if no triggers of the witness channel in the on-source is not found pass #print(target_index) #-------------------------------------------------------------------+ # find the SNR (aux) cut, which corresponds to the importance cut | #-------------------------------------------------------------------+ rho_cut = df_flag[df_flag["{}_flag".format(channel)] == "Y"][channel].min() # the smallest value of importance of the glitches that are flagged in the witness channel rho_cut_index = df_flag.loc[df_flag[channel] == rho_cut].index[0] # corresponding the index of minimum SNR of the witness channel that are flagged snr_cut = df_aux[df_aux["flag_rep_index"] == rho_cut_index]["snr"].values[0] # corresponding the index of minimum SNR of the witness channel that are flagged #-------------------------------------+ # find the efficiency and dead time | #-------------------------------------+ # find the indicies of the target samples where the triggers in the witness channel are coincident with the glitches in the hoft and the SNR_aux is above the SNR cut defined above target_vetoed_index = df_aux[(df_aux["target_rep_index"] != 'None') & (df_aux["snr"] >= snr_cut)]["target_rep_index"].values # calculate the dead time deadtime = np.sum(df_target.iloc[target_vetoed_index]['end_time'] - df_target.iloc[target_vetoed_index]['start_time']) / (epoch_end - epoch_start) efficiency = target_vetoed_index.shape[0] / float(df_target.shape[0]) efficiency_over_deadtime = efficiency / deadtime # make a new column indicatign the veto df_target = df_target.assign(veto=['N']*df_target.shape[0]) df_target.loc[target_vetoed_index, 'veto'] = 'Y' t1 = time.time() print('Analysis time taken: {} s'.format(t1 - t0)) return rho_cut, snr_cut, deadtime, efficiency, efficiency_over_deadtime, df_target
# # # import pandas as pd # import numpy as np # import matplotlib # matplotlib.use("Agg") # import matplotlib.pyplot as plt # # # # # list of glitches that are analyzed (subset) # # list of all the glitches # # list of the omicron triggers of the witness channel # indentify = IdentifyGlitch() # ifostate = 'observing' # # get the state vector # ListEpochs, trigger_dir = indentify.FindObservingTimeSegments(ifo, startT, endT, OutputHDF5_dir, state=ifostate) # # get the omicron triggers # indentify.GetTriggerMetadata(ListEpochs, ifo, trigger_dir, N_processes, trigger_pipeline='omicron', channel='LSC-POP_A_LF_OUT_DQ') # # get the omicron triggers # path_metadata = os.path.join(trigger_dir, 'TriggerMetadata.csv') # # # path_metadata = '/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/O3/H1/for_ryan/1248652818_1249862418/TriggerMetadata.csv' # # load a meta data matrix # df = pd.read_csv(path_metadata) # # epoch_start, epoch_end = 1248652818, 1249862418 # Proportion_Duration_Bfr_Centr = 0.5 # df_foreground = pd.read_csv("/home/kentaro.mogushi/longlived/MachineLearningJointPisaUM/dataset/GravityspyDataSet/O3/for_ryan/gspy.csv") # df_foreground = df_foreground[(epoch_start <= df_foreground["GPStime"]) & (df_foreground["GPStime"] <= epoch_end)] # all the glitches within the epoch # df_flag = pd.read_csv("/home/kentaro.mogushi/public_html/longlive/Plots/GravitySpy/O3/H1/for_ryan/H1/LSC-POP_A_LF_OUT_DQ_BP0_flag.csv") # selected target samples that are flagged # df_target = df_foreground[(df_foreground["label"] == 'Extremely_Loud') & (df_foreground["snr"] > 7.5)] # all the target samples # df_target = df_target.assign(start_time=df_target["GPStime"] - Proportion_Duration_Bfr_Centr * df_target["duration"]) # df_target = df_target.assign(end_time=df_target["GPStime"] + (1 - Proportion_Duration_Bfr_Centr) * df_target["duration"]) # # drop duplicates # df_flag = df_flag.drop_duplicates(subset=["GPS", "duration"]).reset_index() # df_target = df_target.drop_duplicates(subset=["GPStime", "duration"]).reset_index() # # # # # # # mark triggers coincident with the flagged samples # df = df.assign(flag_index=['None']*df.shape[0]) # df = df.assign(flag_rep_index=['None']*df.shape[0]) # list_df_trigger_within_flag = [] # for flag_index in range(df_flag.shape[0]): # flag_start, flag_end, flag_center_GPS = df_flag.iloc[flag_index]["start_time"], df_flag.iloc[flag_index]["end_time"], df_flag.iloc[flag_index]["GPS"] # df.loc[(flag_start <= df["GPStime"]) & (df["GPStime"] < flag_end), 'flag_index'] = flag_index # df_within = df[(flag_start <= df["GPStime"]) & (df["GPStime"] < flag_end)] # try: # SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - flag_center_GPS) # index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()] # df.at[index_maxima, "flag_rep_index"] = flag_index # except ValueError: # print(flag_center_GPS, flag_end - flag_start) # print(flag_index) # # # # mark triggers coincident with the target samples # df = df.assign(target_index=['None']*df.shape[0]) # df = df.assign(target_rep_index=['None']*df.shape[0]) # for target_index in range(df_target.shape[0]): # target_start, target_end, target_center_GPS = df_target.iloc[target_index]["start_time"], df_target.iloc[target_index]["end_time"], df_target.iloc[target_index]["GPStime"] # df.loc[(target_start <= df["GPStime"]) & (df["GPStime"] < target_end), 'target_index'] = target_index # df_within = df[(target_start <= df["GPStime"]) & (df["GPStime"] < target_end)] # try: # SNR_over_diff_abs = df_within["snr"] / abs(df_within["GPStime"] - target_center_GPS) # index_maxima = SNR_over_diff_abs.index[SNR_over_diff_abs.argmax()] # df.at[index_maxima, "target_rep_index"] = target_index # except ValueError: # print(target_index) # # # # # # snr histogram # plt.hist(np.log10(df[df["flag_index"] != 'None']["snr"]), bins=50, density=True, label='flagged', histtype='step') # plt.hist(np.log10(df[df["target_index"] != 'None']["snr"]), bins=50, density=True, label='target', histtype='step', linestyle='--') # plt.legend() # plt.ylabel("Normalized count") # plt.xlabel("log 10 (SNR) of omicron trigger in hoft") # plt.savefig("Hist_snr.png") # plt.close() # # #------------------- # # diff GPS and SNR # #------------------- # list_diff_time = np.array([]) # list_snr = np.array([]) # for flag_index in df[df["flag_index"] != 'None']["flag_index"].unique(): # trig_GPStimes = df[df["flag_index"] == flag_index]["GPStime"].values # SNRs = df[df["flag_index"]==flag_index]["snr"].values # flag_GPStime = df_flag.iloc[flag_index]["GPS"] # diff_GPStime = trig_GPStimes - flag_GPStime # list_diff_time = np.append(list_diff_time, diff_GPStime) # list_snr = np.append(list_snr, SNRs) # # # #------------------- # # diff GPS and SNR (maxma) # #------------------- # list_diff_time_rep = np.array([]) # list_snr_rep = np.array([]) # for flag_index in df[df["flag_index"] != 'None']["flag_index"].unique(): # trig_GPStimes = df[df["flag_index"] == flag_index]["GPStime"].values # SNRs = df[df["flag_index"]==flag_index]["snr"].values # flag_GPStime = df_flag.iloc[flag_index]["GPS"] # if SNR.shape[0] == 0: # print(flag_index) # # diff_GPStime = trig_GPStimes - flag_GPStime # SNR_over_diff_abs = SNRs / np.abs(diff_GPStime) # max_index = np.argmax(SNR_over_diff_abs) # SNR_rep = SNRs[max_index] # diff_GPStime_rep = diff_GPStime[max_index] # list_diff_time_rep = np.append(list_diff_time_rep, diff_GPStime_rep) # list_snr_rep = np.append(list_snr_rep, SNR_rep) # # plt.scatter(list_diff_time, list_snr, s=3, label='all triggers within flags') # plt.scatter(list_diff_time_rep, list_snr_rep, s=3, label='representaitive', marker='*', alpha=0.5) # plt.axvline(0, linestyle='--', color='black') # plt.xlabel("GPS time (aux trigger) - GPS time (hoft)") # plt.ylabel('SNR') # plt.yscale("log") # plt.legend() # plt.savefig("dist_diff_GPS_VS_snr.png") # plt.close() # # list_imp_flag_yes = np.array([]) # list_snr_flag_yes = np.array([]) # list_imp_flag_no = np.array([]) # list_snr_flag_no = np.array([]) # for flag_index in range(df_flag.shape[0]): # imp = df_flag.iloc[flag_index]["LSC-POP_A_LF_OUT_DQ_BP0"] # if df_flag.iloc[flag_index]["flag"] == 'Y': # snr = df[df["flag_rep_index"] == flag_index]["snr"].values # try: # snr = snr[0] # except IndexError: # snr = 0 # print(flag_index, df_flag.iloc[flag_index]["GPS"], df_flag.iloc[flag_index]["duration"]) # list_imp_flag_yes = np.append(list_imp_flag_yes, imp) # list_snr_flag_yes = np.append(list_snr_flag_yes, snr) # else: # list_imp_flag_no = np.append(list_imp_flag_no, imp) # snr = df[df["flag_rep_index"] == flag_index]["snr"].values # try: # snr = snr[0] # except IndexError: # snr = 0 # list_snr_flag_no = np.append(list_snr_flag_no, snr) # # # # plt.scatter(list_imp_flag_no[list_snr_flag_no!=0], list_snr_flag_no[list_snr_flag_no!=0], s=5, label='Not flagged', marker='o', alpha=0.5, color='grey') # plt.scatter(list_imp_flag_yes[list_snr_flag_yes!=0], list_snr_flag_yes[list_snr_flag_yes!=0], s=5, label='flagged', marker='*') # #plt.scatter(list_imp_flag_no[list_snr_flag_no==0], list_snr_flag_no[list_snr_flag_no==0], s=10, label='Not flagged, no trig ', marker='X', alpha=0.5, color='magenta') # #plt.scatter(list_imp_flag_yes[list_snr_flag_yes==0], list_snr_flag_yes[list_snr_flag_yes==0], s=10, label='flagged, no trig', marker='X' ) # plt.legend() # plt.yscale("log") # plt.title("Witness LSC-POP_A_LF_OUT_DQ_BP0") # plt.ylabel('SNR (aux)') # plt.xlabel("Importance") # plt.savefig("dist_imp_VS_snr.png") # plt.close() # # # # # veto the target samples # rho_cut = list_imp_flag_yes[np.argmin(list_imp_flag_yes)] # snr_cut = list_snr_flag_yes[np.argmin(list_imp_flag_yes)] # target_vetoed_index = df[(df["target_rep_index"] != 'None') & (df["snr"] >= snr_cut)]["target_rep_index"].values # target_not_vetoed_index = df[(df["target_rep_index"] != 'None') & (df["snr"] < snr_cut)]["target_rep_index"].values # deadtime = np.sum(df_target.iloc[target_vetoed_index]['end_time'] - df_target.iloc[target_vetoed_index]['start_time']) / (epoch_end - epoch_start) # efficiency = target_vetoed_index.shape[0] / float(df_target.shape[0]) # efficiency_over_deadtime = efficiency / deadtime # # # plt.scatter(df_target["GPStime"], df_target["snr"], label='all') # plt.scatter(df_target.iloc[target_vetoed_index]["GPStime"], df_target.iloc[target_vetoed_index]["snr"], label='whole') # plt.scatter(df_target.iloc[target_not_vetoed_index]["GPStime"], df_target.iloc[target_not_vetoed_index]["snr"], label='not vetoed (coinc)') # plt.axhline(snr_cut, color='black', linestyle='--') # plt.xlabel("GPS time") # plt.title('deadtime = {:.2e}, efficiency = {:.2f}, eff /dead = {:.2f} \n , LSC-POP_A_LF_OUT_DQ_BP0'.format(deadtime, efficiency, efficiency_over_deadtime)) # plt.ylabel('SNR (hoft)') # plt.yscale('log') # plt.legend() # plt.savefig("target_sample_before_selection.png") # plt.close() #