Source code for origli.utilities.utilities

#!/usr/bin/env python

"""

 Script name: utilities.py

 Description:
    File containing utilities




"""
try:
    import cv2
except:
    pass
import json
import numpy as np 
import pandas as pd
import os
import matplotlib
from sklearn import mixture
from sklearn.decomposition import PCA
import h5py
import time
import random as rd
import xml.etree.ElementTree as ET # for XML file
import sys
from math import floor
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import concurrent.futures
import itertools as itrtls
from math import factorial
from scipy.stats import ttest_ind, binom, binom_test, chi2, beta
from sklearn.utils import shuffle as sklearn_shuffle
import scipy.integrate as integrate




try:
   from gwpy.timeseries import TimeSeries, TimeSeriesDict
   from gwpy.segments import DataQualityFlag
   from gwpy.table import EventTable
   from gwpy.table.io.pycbc import filter_empty_files as filter_pycbc_live_files
   from gwpy.time import tconvert
except ImportError:
   print('There is not gwpy module yet')
try:
    from gwtrigfind import find_trigger_files
except ModuleNotFoundError:
    raise ValueError("gwtrigfind has not been installed yet. please do pip install gwtrigfind")

matplotlib.use("Agg")
# matplotlib.rcParams["backend"] = "TkAgg"
# Test display
# try:
#     matplotlib.rcParams["backend"] = "TkAgg"
#     m_dis = os.environ["DISPLAY"]
# except KeyError:
#     print("Display problems. Set to Agg")
#     matplotlib.use('Agg')
#
import matplotlib.pyplot as plt
plt.rc('text', usetex=False) # this is used for fixing a gwpy compatible issue
from matplotlib.colors import LogNorm
from matplotlib.ticker import MaxNLocator

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import origli.utilities.const as Const # for multi frequency bands defined in


__author__ = "Kentaro Mogushi"
__copyright__ = "Copyright 2019-2025, Kentaro Mogushi"
__credits__ = ["Line for credits"]
__license__ = "GPL"
__version__ = "0.1.0"
__maintainer__ = "K. Mogushi"
__email__ = "kmhhz@umsystem.edu"
__status__ = "Developing"


# def convert_imageset_to_pandas(m_indexfile):
#     """
#
#     :param m_indexfile:
#     :return:
#     """
#
#     df_in = pd.read_csv(m_indexfile)
#     label_list=[]
#     image_list=None
#
#     for ii in range(len(df_in)):
#         m_img_array,h,w,d = image_to_1d(df_in.iloc[ii]["id"])
#
#         m_img_array = np.insert(m_img_array,0,d,axis=1)
#         m_img_array = np.insert(m_img_array,0,w,axis=1)
#         m_img_array = np.insert(m_img_array,0,h,axis=1)
#
#         try:
#             image_list = np.concatenate((image_list,m_img_array),axis=0)
#         except ValueError:
#             image_list = m_img_array
#         label_list.append(df_in.loc[ii]["label"])
#
#
#     label_list = np.asarray(label_list)
#
#     #build the labeling
#     col_names =["h","w","d"]
#     for hi in range(h):
#         for wi in range(w):
#             for di in range(d):
#                 col_names.append("px-h%s-w%s-d%s" % (hi,wi,di))
#
#
#     image_df = pd.DataFrame(data=image_list,columns=col_names)
#     label_df = pd.DataFrame(data=label_list,columns=["label"])
#
#     out_df = label_df.join(image_df)
#
#     if len(out_df)!=len(df_in):
#         raise RuntimeError("Input and output numer of images does not match!")
#
#     #if m_output_name is not None:
#     #    out_df.to_csv(m_output_name)#,columns=["label",col_names])
#     #    out_df.to_hdf(m_output_name,key="df", mode='w', complevel=9, complib="bzip2")
#     return out_df
#
# def image_to_1d(m_input_image_filename):
#     """
#
#     :param m_input_image:
#     :return:
#     """
#
#     m_input_image = cv2.imread(m_input_image_filename)
#
#     h = m_input_image.shape[0]
#     w = m_input_image.shape[1]
#     d = m_input_image.shape[2]
#     cv2.destroyAllWindows()
#
#     return m_input_image.reshape(1,h*w*d),h,w,d
#
# def read_json_file(m_filename):
#     m_out_dict = json.loads(open(m_filename).read())
#
#     return m_out_dict
#
#
#


###################################################
# grab the GPS times of a glitch for a certain ifo
###################################################

def GrabGPStimesSafechannel(fileid, ifo, SNRthre, glitch, PathSafeChannel, Epochstart, Epochend, Commissioning_lt=None):
    '''
    description:
        Read a GravitySpy metadata table, keep the glitches of one class seen by
        one interferometer with SNR at or above a threshold and strictly inside
        a science-run epoch, optionally drop the glitches falling inside
        commissioning windows, and load the list of safe channels renamed so
        that the first '_' of every channel becomes '-' (the form used with gwpy).

    USAGE: GPSs, ids, re_sfchs = GrabGPStimesSafechannel('gspy-db-20180813.csv', 'L1', 7, 'Blip', 'O2_omicron_channel_list_hvetosafe_GDS.txt', Epochstart, Epochend)

    :param fileid: path of the .csv file with the glitch metadata; the columns 'ifo', 'label', 'snr', 'GPStime' and 'id' are read
    :param ifo: a kind of interferometer {L1, H1, V1}
    :param SNRthre: the minimum threshold of SNR, e.g, 7 (inclusive)
    :param glitch: a kind of glitch (value of the 'label' column)
    :param PathSafeChannel: path of the text file with one safe channel per line
    :param Epochstart: GPS time when a science run begins, float or int (exclusive)
    :param Epochend: GPS time when a science run ends, float or int (exclusive)
    :param Commissioning_lt: the set of commissioning times as a sequence of [start, end] pairs,
                             e.g., [[Cstart1, Cend1], [Cstart2, Cend2]]; a list or a numpy array, None in default
    :return GPSs: a numpy array of GPS times
            ids: a numpy array of unique ids
            re_sfchs: a numpy array of renamed safe channels
    :raises IndexError: if a safe channel name contains no '_'
    '''
    # import the data file in pandas format
    df = pd.read_csv(fileid)
    # glitches of class 'glitch' seen by 'ifo' with SNR at or above SNRthre
    selected = df[(df['ifo'] == ifo) & (df['label'] == glitch) & (df['snr'] >= SNRthre)]
    # restrict the data to the science run (both ends are exclusive)
    selected = selected[(selected['GPStime'] > Epochstart) & (selected['GPStime'] < Epochend)]
    # remove the commissioning periods inside the science run.
    # 'is not None' is required here: comparing a numpy array with '!= None'
    # is evaluated element-wise and makes the 'if' raise ValueError
    if Commissioning_lt is not None:
        for Commissioning in Commissioning_lt:
            Commissioning_start, Commissioning_end = Commissioning[0], Commissioning[1]
            # triggers exactly on a window edge are dropped as well
            selected = selected[(selected['GPStime'] < Commissioning_start) | (selected['GPStime'] > Commissioning_end)]
    # the list of GPS times and of unique IDs
    GPSs = np.array(selected['GPStime'].tolist())
    ids = np.array(selected['id'].tolist())
    # the list of safe channels, one name per line in the file
    sfchs = np.array(pd.read_csv(PathSafeChannel, names=['channel'])['channel'])
    # rename the channels: split at the first '_' and join the two parts with '-'
    re_sfchs = []
    for channel_name in sfchs:
        splited_name = channel_name.split('_', 1)
        re_sfchs.append('%s-%s' % (splited_name[0], splited_name[1]))
    re_sfchs = np.array(re_sfchs)
    return GPSs, ids, re_sfchs
################################################### # grab the list of the safe channels in IFO ###################################################
def GrabSafechannel(PathSafeChannel):
    '''
    description:
        Load the list of safe channels, sort it, and rename every channel so
        that it works in gwpy: the name is cut at its first '_' and the two
        pieces are joined with '-'.

    USAGE: re_sfchs = GrabSafechannel('O2_omicron_channel_list_hvetosafe_GDS.txt')

    :param PathSafeChannel: the full path of the text file with one channel name per line
    :return re_sfchs: the sorted, renamed safe channels in a numpy array
    '''
    # one channel name per line, read as a single column called 'channel'
    channel_table = pd.read_csv(PathSafeChannel, names=['channel'])
    # sorted copy of the channel names
    ordered_names = np.sort(np.array(channel_table['channel']))
    # cut each name at its first '_' only; a name without '_' fails on the [1] lookup
    name_pieces = (raw_name.split('_', 1) for raw_name in ordered_names)
    renamed = ['{}-{}'.format(pieces[0], pieces[1]) for pieces in name_pieces]
    return np.array(renamed)
################################################### # remove the channels that are not used in O2 ####################################################
def RemoveChannelUnused(re_sfchs, PathListChannelUnused):
    '''
    Description:
        Some of the channels in the list of the safe channels are not used in O2,
        so gwpy can not get the time series of those channels. This function
        removes, in this order:
            1. 'GDS-CALIB_STRAIN' (it is not a safe channel)
            2. every channel listed in the file PathListChannelUnused
            3. every channel whose part after the first '-' has 'NOISEMON' as
               its third '_'-separated token

    USAGE: re_sfchs = RemoveChannelUnused(re_sfchs, '/path/to/list_of_unused_channels.txt')

    :param re_sfchs: the list of the safe channels in the numpy array format
    :param PathListChannelUnused: path of a text file with one unused channel per line
    :return re_sfchs: the list of the safe channels without unused channels in the numpy array format
    '''
    # remove GDS-CALIB_STRAIN as it is not the safe channel
    re_sfchs = re_sfchs[re_sfchs != 'GDS-CALIB_STRAIN']
    # np.loadtxt returns a 0-d array for a one-line file; atleast_1d makes the
    # result iterable for one, many, or zero entries alike
    UnusedChannels = np.atleast_1d(np.loadtxt(PathListChannelUnused, dtype='str'))
    for unusedChannel in UnusedChannels:
        re_sfchs = re_sfchs[re_sfchs != unusedChannel]
    # remove noisemon channels
    keep = np.ones(len(re_sfchs), dtype=bool)
    for position, ChannelName in enumerate(re_sfchs):
        try:
            if ChannelName.split('-')[1].split('_')[2] == 'NOISEMON':
                keep[position] = False
        except IndexError:
            # the name has no '-' or fewer than three '_' tokens: not a noisemon channel
            pass
    return re_sfchs[keep]
def ListUsedSafeChannel(path_list_channel, ifo):
    '''
    description:
        1. take a path to a .csv file that has lists of channels
        2. remove unused safe channel from a list of safe channels

        The file is read without a header row; its four columns are taken, in
        order, as the H1 safe channels, the H1 unused channels, the L1 safe
        channels and the L1 unused channels.

    USAGE: sfchs = ListUsedSafeChannel(path_list_channel, ifo)

    :param path_list_channel: a path to a list of channels (.csv file)
    :param ifo: observatory {L1, H1} in str
    :return: sfchs: subset of safe channels in numpy array
    :raises KeyError: if ifo is neither 'H1' nor 'L1'
    '''
    # the four names must follow the pattern '<ifo>List...Channel' because the
    # columns are looked up from the ifo name below
    column_names = ['H1ListSafeChannel', 'H1ListUnusedChannel',
                    'L1ListSafeChannel', 'L1ListUnusedChannel']
    ListChannel_df = pd.read_csv(path_list_channel, names=column_names, dtype=str)
    # columns of unequal length are padded with NaN by pandas; the padding is
    # not a channel name and is dropped
    safe_channels = ListChannel_df['{}ListSafeChannel'.format(ifo)].dropna().tolist()
    unused_channels = set(ListChannel_df['{}ListUnusedChannel'.format(ifo)].dropna().tolist())
    sfchs = np.array([channel for channel in safe_channels if channel not in unused_channels], dtype=str)
    return sfchs
# # ####################################### # # take the subset of the channels # ####################################### # # def TakeSubsetList(list, subset_num, cor_list=[None]): # ''' # # :param list: an original list # :param subset_num: the size of the output list which is randomly chosen a subset of the original list # :param cor_list: a corresponding list if it is related to the original list (e.g., IDs of the list GPSs given by GravitySpy) # :return: a reduced list, a corresponding list # ''' # print(cor_list[0]) # if cor_list[0] == None: # if len(list) > subset_num: # if the size of an original list is greater than the size of the subset # list_subset = np.array(rd.sample(list, subset_num)) # else: # if not # list_subset = list # return list_subset # else: # if len(list) > subset_num: # if the size of an original list is greater than the size of the subset # list_subset = np.array(rd.sample(list, subset_num)) # else: # if not # list_subset = list # # the index having the elements of the subset in list # ind_subset = [np.where(list==i)[0][0] for i in list_subset] # # the subset of cor_list # cor_list_subset = cor_list[ind_subset] # return list_subset, cor_list_subset # # ################################################################ # # write out the output of the safe channels around GPS times # ############################################################### # # def SaveTimeseriesHdf5(GPSs, ids, re_sfchs, ifo, wd, outputpath, outputfilename): # ''' # This extracts the output of the safe channels around the GPS times # input: # GPSs: the list of GPS times for a given glitch # ids: the list of unique IDs labeled GravitySpy # re_sfchs: the list the safe channels # ifo: kind of interferomter {'H1', 'L1', 'V1'} # wd: forward and backward time window around the peak GPS time # outputpath: Path of an output directory # outputfilename: the name of a output file where the file stores the time series of a glitch # output: # hdf5 file # data structure: group: 
"gps0" has dataset "data0", "data1", ... # dataset: "data0" has times series of every safe channels # dataset attributes: "GPS", "channel", "SamplingRate" # description: # 1- if there is a group about a GPS exists # 1- if there are full channel data sets stored in this group # 1- if there actually time series stored in each data set # -> skip this group and go to the next GPS iteration # 2- if there is at least an empty data set in this group # -> delete this group and re-create this group, then extract data sets from the server # 2- if there are no or not enough data sets stored in this group # -> delete this group and re-create this group, then extract data sets from the server # # USAGE:write_output_SafeChannel('test.') # ''' # # the path and the file name # ouputPathFileName = os.path.join(outputpath, outputfilename) # # make a log file # logFileName = '{}LOG.txt'.format(outputfilename.split('.')[0]) # log file name # logfilePathName = os.path.join(outputpath, logFileName) # # path and the file name of the log file # logfileobj = open(logfilePathName, 'w') # Truncate file to zero length or create text file for writing # # # looping with GPS times and the list of id # for group_id, (GPS, id)in enumerate(zip(GPSs, ids), 0): # try: # create a file in hdf5 format it there is no such a file # f = h5py.File(ouputPathFileName, 'a') # except IOError: # f = h5py.File(ouputPathFileName, 'r+') # # note, group_id is used as a label for a group in hdf5 file # # make a group in the file as, for example, gps00001 # try: # create a group if it does not exist # g = f.create_group("gps{:05d}".format(group_id)) # # add attributes # g.attrs['GPS'] = GPS # g.attrs['ifo'] = ifo # g.attrs['id'] = id # except ValueError: # this group already exists # # read this group # g = f["gps{:05d}".format(group_id)] # # check there are every safe channels data stored in this group # # check if there are all the channels in this group (e.g., 815) # if len(g.keys()) == len(re_sfchs): # yes, there 
are data sets with the same number of the channels # # check if there are actually time series stored in this dataset # for dset_name in g.keys(): # if g['{}'.format(dset_name)].value.shape[0] == 0: # no, some dataset is empty # argu = 'there is at least an empty dataset' # for the next evaluation # break # get out this loop # else: # yes, all the data sets are filled with data # argu = 'there are full datasets' # for the next evaluation # pass # if argu == 'there is at least one empty dataset': # del f['gps{:05d}'.format(group_id)] # delete this group # elif argu == 'there are full datasets': # print('group{:05d} out of {} groups: there are full datasets around GPS time {}'.format(group_id, len(GPSs),GPS)) # log1 = 'group{:05d} out of {} groups at GPS time {} was already done. ID:{}'.format(group_id, len(GPSs),GPS, id) # logfileobj.write(log1) # take a log # logfileobj.write('\n') # to the next line # continue # go to the next GPS iteration # else: # no, there are less or no data sets with the same number of the channels # del f['gps{:05d}'.format(group_id)] # delete this group # try: # re-create this group # g = f.create_group("gps{:05d}".format(group_id)) # g.attrs['GPS'] = GPS # g.attrs['ifo'] = ifo # g.attrs['id'] = id # except ValueError: # pass # # # the list of channel names # chnames = ['{0}:{1}'.format(ifo, i) for i in re_sfchs] # # start time # tstart = GPS - wd # # end time # tend = GPS + wd # try: # extract time series from the server # tbeg = time.time() # print('group{:05d} out of {} groups : taking data set around GPS time {} ...'.format(group_id, len(GPSs), GPS)) # FrameType = '{}_R'.format(ifo) # data = TimeSeriesDict.get(chnames, tstart, tend, frametype=FrameType, verbose=True) # tfin = time.time() # print('It took {0} sec for getting data at {1}'.format(float(tfin-tbeg), GPS)) # # make log # log2 = 'group{:05d} out of {} groups at GPS time {} is now done. 
ID:{}'.format(group_id, len(GPSs),GPS, id) # logfileobj.write(log2) # take a log # logfileobj.write('\n') # go to the new line # except IndexError: # # this gives, e.g., IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ISI-GND_STS_ETMY_X_DQ offset: 1153856716) # log3 = 'IndexError: group{:05d} out of {} groups st GPS time {} was SKIPPED. ID:{}'.format(group_id, len(GPSs), GPS, id) # print(log3) # logfileobj.write(log3) # take a log # logfileobj.write('\n') # go to the new line # continue # go to the next GPS iteration # # store the data # for dset_id, channel in enumerate(chnames, 0): # # time series of the data # ts = data[channel].value # # sampling rate in Hz # samplingrate = data[channel].sample_rate.value # # store the time series with compression # dset = g.create_dataset("ch{:05d}".format(dset_id), data=ts, compression="gzip", compression_opts=9) # # make attributes on the data set 'dset' # dset.attrs['channel'] = channel.split(':')[1] # dset.attrs['SamplingRate'] = samplingrate # f.close() # logfileobj.close() # return None # #
def Multiprocess_ConvertToTable(cache_indiv, trigger_pipeline, IFO, Columns):
    '''
    description:
        Read one trigger file with gwpy's EventTable and convert it to a pandas frame.
        Multi process does not work if this is included inside the class
        IdentifyGlitch(), so it is defined here globally.

    :param cache_indiv: an individual cache (each trigger file)
    :param trigger_pipeline: a name of trigger pipeline {omicron, pycbc-live}
    :param IFO: a name of the detector {L1, H1}; only used for pycbc-live
    :param Columns: a list of columns for a metadata; only used for pycbc-live (None for omicron)
    :return: df_indiv: a metadata of triggers in pandas frame; an empty frame when
             the file could not be read (KeyError, or ValueError for omicron)
    :raises ValueError: if trigger_pipeline is neither 'omicron' nor 'pycbc-live'
    '''
    if trigger_pipeline == 'omicron':
        try:
            t = EventTable.read(cache_indiv, format='ligolw', tablename='sngl_burst')
            df_indiv = t.to_pandas()  # convert it to a pandas table
        except (KeyError, ValueError):
            df_indiv = pd.DataFrame()
    elif trigger_pipeline == 'pycbc-live':
        try:
            t = EventTable.read(cache_indiv, format='hdf5.pycbc_live', ifo='{}'.format(IFO), columns=Columns)
            df_indiv = t.to_pandas()  # convert it to pandas frame
        except KeyError:
            df_indiv = pd.DataFrame()
    else:
        # without this branch the function ended in an UnboundLocalError on
        # 'df_indiv', which hid the actual mistake from the caller
        raise ValueError('unknown trigger pipeline {!r}: choose "omicron" or "pycbc-live"'.format(trigger_pipeline))
    return df_indiv
class IdentifyGlitch():
    '''
    description:
        Helper methods for collecting trigger metadata (observing segments,
        omicron trigger files) and for filtering pyCBC trigger tables.
        The methods shown here keep no state on the instance.
    '''

    def __init__(self):
        # nothing to initialise
        pass
[docs] def FindObservingTimeSegments(self, IFO, startT, endT, outputMother_dir, state='observing'): ''' description: 1. take a DataQualityFrag from a server 2. save its segment data as a HDF file 3. take active segments 4. return active segments as a numpy array USAGE: SegmentsMat, trigger_dir = FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state) :param IFO: a type of interferometer, 'L1' or 'H1' :param startT: (float, int or string: e.g., 'Dec 8 2016') starting time :param endT: ending time :param outputMother_dir: an ouput directory where the HDF5 file will be stored :param state: state of an interferometer, {observing, nominal-lock} :return: SegmentsMat: active segments in a numpy array ''' # choose a state channel if state == 'observing': StateChannel = 'DMT-ANALYSIS_READY:1' elif state == 'nominal-lock': StateChannel = 'DMT-GRD_ISC_LOCK_NOMINAL:1' else: print('Please a state of this interferometer either "observing" or "nominal-lock"') sys.exit() # take segments data from a server segs = DataQualityFlag.query('{0}:{1}'.format(IFO, StateChannel), startT, endT) # path of an output directory trigger_dir = os.path.join(outputMother_dir, '{}_{}'.format(int(startT), int(endT))) try: os.mkdir(trigger_dir) except OSError: pass # path of an output file output_path = os.path.join(trigger_dir, 'DataQualityFlag.h5') # save it as a HDF5 file try: segs.write(output_path) except IOError: # if this file already exists # delete the file os.remove(output_path) # re-create the file segs.write(output_path) # import this HDF5 file segs_h5 = h5py.File(output_path, 'r') # get active segments Active_segs = segs_h5['{0}:{1}'.format(IFO, StateChannel)]['active'][()] SegmentsMat = np.zeros((Active_segs.size, 2)) for ind, seg in enumerate(Active_segs, 0): SegmentsMat[ind, 0] = seg[0] # starting time SegmentsMat[ind, 1] = seg[2] # ending time SegmentsMat = SegmentsMat.astype(int) return SegmentsMat, trigger_dir
[docs] def OmimcronTriggerPath(self, ListSegments, TriggerDir, IFO, output_file='omicron.txt', channel='GDS-CALIB_STRAIN'): ''' description: 1. log-in either Livingston or Hanford cluster 2. take the path of files storing omicron triggers during a segment 3. save those paths into an output file USAGE: OmimcronTriggerPath('L1', ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger', IFO) :param IFO: interferometer (L1, or H1) :param ListSegments: (list of lists) [[s1, e1], [s2, e2], ...], this is because I want to exclude segments of non-observing times :param channel: (str) the name of a channel :param trigger_dir: (str) an output directory :param output_file: (str) an output file :param: IFO: :return: trigger_dir ''' output_path = os.path.join(TriggerDir, output_file) for ObservingSegment in ListSegments: startT, endT = ObservingSegment[0], ObservingSegment[1] # save paths of all the omicron trigger files between startT and eneT to output_path os.system('/home/detchar/opt/gwpysoft-2.7/bin/trigfind {0}:{1} omicron {2} {3} >> {4}'.format(IFO, channel, startT, endT, output_path)) print('Done for getting a list of paths of trigger files') return None
[docs] def CopyOmicroTriggerandUnzip(self, input_file, input_dir, TriggerDir, output_dir='OmicronTriggerXML'): ''' description: 1. load a file comprising all the paths of omicron trigger files you are interested in 2. copy all the omicron trigger file in your working place 3. go to an output directory 4. unzip all the files and replace them with the zipped files 5. go back to a working directory USAGE: CopyOmicroTriggerandUnzip('omicron.txt', trigger_dir, trigger_dir) :param input_file: an input file :param input_dir: an input directory, in default, a current directory :param trigger_dir: # a directory right above an output directory :param output_dir: # an output directory where all the omicron trigger files will be stored :return: None ''' #++++++++++++++ load a file and copy ++++++++++++++++++++++++++ input_path = os.path.join(input_dir, input_file) # input path f = np.loadtxt(input_path, dtype=np.str) # load an input file output_path = os.path.join(TriggerDir, output_dir) # output path try: os.mkdir(output_path) # mkdir an output directory except OSError: pass # do nothing if the directory already exists for i in f: # iterate through paths of omicron triggers stored in an input file copy = 'cp {0} {1}'.format(i, output_path) # make the command line used in the next line os.system(copy) # copy omicron trigger files into your working place #+++++++++ unzip them +++++++++++++++++++++++++ os.chdir(output_path) # go to an output directory os.system('for f in *.gz ; do gunzip -f "$f" > ./"${f%.*}" ; done') # unzip all the file # -f replaces the original .xml.gz file with .xml # -c unzip .xml.gz file and also keep the .xml file os.chdir(TriggerDir) # go back to a working directory return None
[docs] def readXML(self, input_dir, input_file): ''' description: 1. take meta data stored s a .xml file in a directory named OmicronTriggerXML 2. meta data matrix in the form of list USAGE: TriggerMat = readXML(input_dir, input_file) :param input_dir: an input directory :param input_file: input file name :return: name_a: (numpy array) ''' # read a XML file path_input = os.path.join(input_dir, input_file) # input path tree = ET.parse(path_input) # tree of an input file root = tree.getroot() # root of an input file #++++++++ get the omicron trigger's data set++++++++++++++++ g = [] for i in root[2]: a = str(i.text) g.append(a) # trim meta data for the sake of convenience h = np.array(g) # convert list object to numpy array ind_nan = np.where(h == 'None') # find the indices where element is 'None' h_new = np.delete(h, ind_nan) # without nan a = h_new[0].split('\n\t\t\t') # split a[-1] = a[-1].replace('\n\t\t', '') # remove the last as the last one is redundant a_new = a[1:] # get rid of the first one which is redundant and make another array # +++++++++ get the name of the data set (columns)++++++++++++++++ name_list = [] # name column names for i in root[2]: names = str(i.attrib) name_list.append(names) for idx in range(len(name_list)): name_list[idx] = name_list[idx].split(':')[-1].replace("'}", "") # remove the last one #name_list = name_list[:-1] #name_com = name_list[0] + '' # combine every name name_com = '' for i in name_list: name_com = name_com + i + ',' # remove the ',' at the end and make it as list name_com = [name_com[:-1]] # combine the name and numbers name_a = name_com + a_new return name_a
[docs] def ExtractOmcronTriggerMetadata(self, name_a): ''' description: 1. load meta data matrix, expected to be created by readXML() 2. re-arrange for the sake of convenience and convert it in a numpy array 3. return numpy re-arranged meta data matrix USAGE: Matdataset = ExtractOmcronTriggerMetadata(name_a) :param name_a: omicron trigger info (list) :return: Matdataset: omicron trigger metadata (numpy array) ''' # create the list of peak GPS times and durations Matdataset = np.array([]) a_new = name_a[1:] for i in a_new: peakGPS = float(i.split(',')[1]) # peak GPS time peakGPSns = float('0.'+i.split(',')[2]) # peak GPS time ns peakGPSfull = peakGPS+peakGPSns # full peak GPS time startGPS = float(i.split(',')[3]) # start GPS time startGPSns = float('0.'+i.split(',')[4]) # start GPS time ns startGPSfull = startGPS+startGPSns # full start GPS time peakFreq = float(i.split(',')[9]) snr = float(i.split(',')[14]) Amplitude = float(i.split(',')[13]) centralFreq = float(i.split(',')[10]) duration = float(i.split(',')[5]) # duration bandwidth = float(i.split(',')[11]) chisq = float(i.split(',')[16]) chisqDof = float(i.split(',')[17]) confidence = float(i.split(',')[15]) id = None ifo = i.split(',')[0].split('"')[1] DataList = np.array([peakGPSfull, startGPSfull, peakFreq, snr, Amplitude, centralFreq, duration, bandwidth, chisq, chisqDof, confidence, id, ifo]) Matdataset = np.vstack([Matdataset, DataList]) if Matdataset.size else DataList return Matdataset
[docs] def SaveMetaDataAsCSV(self, MetaData, output_dir, output_file): ''' description: 1. load trigger meta data matrix, which is expected to be created by ExtractOmcronTriggerMetadata() 2. add label for these triggers with 'unknown' 3. add imgUrl with None 4. save this matrix as .csv file USAGE: SaveMetaDataAsCSV(AllMatDataStr, trigger_dir, 'OmicrontriggerMetadata.csv') :param MetaData: numpy array, omicron trigger metadata :param output_file: an output file :param output_dir: and output directory :return: None ''' Names = ['GPStime', 'startGPStime', 'peakFreq', 'snr', 'amplitude', 'centralFreq', 'duration', 'bandwidth', 'chisq', 'chisqDof', 'confidence', 'id', 'ifo'] df = pd.DataFrame(MetaData, columns=Names) df = df.assign(label=['arbitrary'] * len(df)) df = df.assign(imgUrl=[None] * len(df)) output_path = os.path.join(output_dir, output_file) df.to_csv(output_path) return None
[docs] def CombinedIndentifyingProcess(self, IFO, ListSegments, TriggerDir): ''' description: 1. use OmimcronTriggerPath() to log in either L1 or H1 cluster and get a list of trigger files paths 2. use CopyOmicroTriggerandUnzip() to copy trigger files and unzip them 3. iterate with trigger XML files 3-1. use readXML() to get mata data stored in a .xml file 3-2. use ExtractOmcronTriggerMetadata() to re-arrange meta data matrix 4. save the matrix as .csv file 5. copy the .csv file into the CIT cluster and go back to the CIT cluster USAGE: CombinedIndentifyingProcess(self, 'L1', ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger') :param IFO: ifo {H1, L1, V1} :param ListSegments: a list of segments :param TriggerDir: a mother directory of a trigger file :return: None ''' print('Begin to get trigger meta data') # get a list of trigger files paths print('get trigger files paths') self.OmimcronTriggerPath(ListSegments, TriggerDir, IFO) # copy trigger files and unzip them print('copy trigger files and unzip them') self.CopyOmicroTriggerandUnzip('omicron.txt', TriggerDir, TriggerDir) # path of trigger XML files omicrontriggerXMLdir = os.path.join(TriggerDir, 'OmicronTriggerXML') # a list of trigger files ListTriggerFiles = os.listdir(omicrontriggerXMLdir) AllMatData = np.array([]) print('creating meta data matrix') for TrigerFile in ListTriggerFiles: # iterate with trigger files TriggerMat = self.readXML(omicrontriggerXMLdir, TrigerFile) # get trigger meta data Matdataset = self.ExtractOmcronTriggerMetadata(TriggerMat) # re-arrange this meta data AllMatData = np.vstack([AllMatData, Matdataset]) if AllMatData.size else Matdataset # combine matrices self.SaveMetaDataAsCSV(AllMatData, TriggerDir, 'OmicrontriggerMetadata.csv') # save the matrix as .csv file print('Done!') return None
[docs] def pyCBC_SNR_filter(self, all_pycbc_triggers_frame, SNR_low_cut, SNR_high_cut): ''' description: band pass filter with SNR for pycbc triggers USAGE: df_SNR_cut = pyCBC_SNR_filter(all_pycbc_triggers_frame, SNR_low_cut=7.5, SNR_high_cut=150) :param all_pycbc_triggers_frame: a pandas frame of all the pyCBC triggers :param SNR_low_cut: a lower cutoff SNR :param SNR_high_cut: a higher cutoff SNR :return: df_new: pycbc triggers that pass the all the condition defined above ''' df_new = all_pycbc_triggers_frame[(all_pycbc_triggers_frame['snr'] >= SNR_low_cut) & (all_pycbc_triggers_frame['snr'] < SNR_high_cut)] return df_new
[docs] def pyCBC_template_duration_filter(self, pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration): ''' description: band pass filter with template duration for pycbc triggers USAGE: df_template_duration_cut = pyCBC_template_duration_filter(pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration) :param pycbc_triggers_frame: pyCBC triggers in pandas frame :param low_cutoff_duration: a lower cutoff of the template duration in sec :param high_cutoff_duration: a higher cutoff of the template duration in sec :return: df_new: triggers with template druation between low_cutoff_duration and high_cutoff_duration ''' df_new = pycbc_triggers_frame[(pycbc_triggers_frame['template_duration'] >= low_cutoff_duration) & (pycbc_triggers_frame['template_duration'] < high_cutoff_duration)] return df_new
[docs] def pyCBC_totalmass_filter(self, pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass): ''' description band pass filter with total mass for pycbc triggers USAGE: df_totalmass_cut = pyCBC_totalmass_filter(pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass) :param pycbc_triggers_frame: pyCBC triggers in pandas frame :param low_cutoff_totalmass: a lower cutoff of the total mass in solar mass :param high_cutoff_totalmass: a higher cutoff of the total mass in solar mass :return: triggers with total mass between low_cutoff_totalmass and high_cutoff_totalmass ''' pycbc_triggers_frame = pycbc_triggers_frame.assign(totalmass=pycbc_triggers_frame.mass1+pycbc_triggers_frame.mass2) df_new = pycbc_triggers_frame[(pycbc_triggers_frame['totalmass'] >= low_cutoff_totalmass) & (pycbc_triggers_frame['totalmass'] < high_cutoff_totalmass)] return df_new
[docs] def pyCBC_massratio_filter(self, pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio): ''' description band pass filter with total mass for pycbc triggers USAGE: df_massratio_cut = pyCBC_query_massratio(pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio) :param pycbc_triggers_frame: pyCBC triggers in pandas frame :param low_cutoff_totalmass: a lower cutoff of the total mass in solar mass :param high_cutoff_totalmass: a higher cutoff of the total mass in solar mass :return: df_new: triggers with mass ratio between low_cutoff_massratio and high_cutoff_massratio ''' pycbc_triggers_frame = pycbc_triggers_frame.assign(massratio=pycbc_triggers_frame.mass1/pycbc_triggers_frame.mass2) df_new = pycbc_triggers_frame[(pycbc_triggers_frame['massratio'] >= low_cutoff_massratio) & (pycbc_triggers_frame['massratio'] < high_cutoff_massratio)] return df_new
def calculate_chisqr_weighted_snr(self, snr, chisq, chisq_dof):
    '''
    description:
        calculate the chi-square weighted SNR of a single trigger
        Reference: Macleod et al. 2015, Equation (21)
    USAGE: chisqr_weighted_snr = calculate_chisqr_weighted_snr(snr, chisq, chisq_dof)

    :param snr: SNR
    :param chisq: chi-square
    :param chisq_dof: chi-square degrees of freedom
    :return: chisq_weighted_snr: the chi-square weighted SNR;
             equal to snr when chisq <= chisq_dof, otherwise
             snr * ((1 + (chisq / chisq_dof)**3) / 2)**(-1/6)
    '''
    if chisq <= chisq_dof:
        return snr
    # A plain `else` (rather than `elif chisq > chisq_dof`) so that NaN inputs,
    # for which both comparisons are False, yield NaN instead of leaving the
    # result unassigned and raising UnboundLocalError.
    return snr * ((1 + (chisq / chisq_dof) ** 3) / 2.) ** (-1 / 6.)
def pyCBC_chisqr_weighted_snr_filter(self, pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff):
    '''
    description:
        high-pass filter on the chi-square weighted SNR of pyCBC triggers.
        The weighted SNR is snr when chisq <= chisq_dof, otherwise
        snr * ((1 + (chisq / chisq_dof)**3) / 2)**(-1/6).
        The returned frame carries an extra 'chisqr_weighted_snr' column.
    USAGE: df_chi_square_weighted_cut = pyCBC_chisqr_weighted_snr_filter(pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff)

    :param pycbc_triggers_frame: pyCBC triggers in pandas frame (needs 'snr', 'chisq' and 'chisq_dof' columns)
    :param chisqr_weighted_snr_lower_cutoff: a lower cutoff chi-square weighted snr (exclusive)
    :return: df_new: triggers with chi square weighted SNR above chisqr_weighted_snr_lower_cutoff
    '''
    snr = np.asarray(pycbc_triggers_frame.snr, dtype=float)
    chisq = np.asarray(pycbc_triggers_frame.chisq, dtype=float)
    chisq_dof = np.asarray(pycbc_triggers_frame.chisq_dof, dtype=float)

    # Whole-array evaluation instead of np.vectorize: np.vectorize without
    # `otypes` raises on an empty frame and calls Python once per trigger.
    # The re-weighted value is computed everywhere and only selected where
    # chisq > chisq_dof, so warnings from unselected entries are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        reweighted = snr * ((1 + (chisq / chisq_dof) ** 3) / 2.) ** (-1 / 6.)
    weighted_snr = np.where(chisq <= chisq_dof, snr, reweighted)

    pycbc_triggers_frame = pycbc_triggers_frame.assign(chisqr_weighted_snr=weighted_snr)
    df_new = pycbc_triggers_frame[pycbc_triggers_frame.chisqr_weighted_snr > chisqr_weighted_snr_lower_cutoff]
    return df_new
def pyCBC_query_outlier(self, pycbc_triggers_frame, BinNum, Nsigma, cut):
    '''
    description:
        1. bin the triggers with values of log10 of chi-square per DOF
        2. in every bin calculate the median of log10 of SNR and the median of
           log10 of chi-square per DOF
        3. (cut = 'median') use the per-bin medians as the representative curve
        3. (cut = 'mad') use, per bin,
               y = median(log10 SNR) + Nsigma * MAD(log10 SNR)
               x = median(log10 chi-square per DOF) - Nsigma * MAD(log10 chi-square per DOF)
           where MAD is the median absolute deviation
        4. fit a third-order polynomial y(x) to the representative curve
        5. return the triggers whose log10 SNR lies above the fitted curve (loud triggers)
    USAGE: pycbc_loud = pyCBC_query_outlier(pycbc_triggers_frame, BinNum=50, Nsigma=1, cut='median')

    :param pycbc_triggers_frame: pycbc triggers in pandas frame (needs 'snr', 'chisq' and 'chisq_dof' columns)
    :param BinNum: the number of bin edges for log10 of chi-square per degrees of freedom (BinNum - 1 bins)
    :param Nsigma: multiple of the median absolute deviation used by cut = 'mad' (unused for cut = 'median')
    :param cut: a method of cut {'median' or 'mad'}
    :return: pycbc_loud: loud triggers, with the extra columns 'log_SNR' and 'log_chisqr_per_dof'
    :raises ValueError: if cut is neither 'median' nor 'mad'
    '''
    if cut not in ('median', 'mad'):
        # previously an unknown cut fell through and failed with UnboundLocalError
        raise ValueError("cut must be 'median' or 'mad', got {!r}".format(cut))

    # log10 of SNR and log10 of chi-square per its degree of freedom
    frame = pycbc_triggers_frame.assign(log_SNR=np.log10(pycbc_triggers_frame.snr))
    frame = frame.assign(log_chisqr_per_dof=np.log10(frame.chisq / frame.chisq_dof))
    log_x = frame.log_chisqr_per_dof

    bin_edges = np.linspace(log_x.min(), log_x.max(), BinNum, endpoint=True)

    def median_abs_dev(values):
        # NOTE: the removed pandas Series.mad() returned the *mean* absolute
        # deviation; the documented quantity here is the median absolute
        # deviation, which is what is computed now.
        return (values - values.median()).abs().median()

    # per-bin statistics; bins are [start, end), so the trigger holding the
    # maximum of log_chisqr_per_dof falls in no bin (as before)
    median_log_snr, median_log_x = [], []
    mad_log_snr, mad_log_x = [], []
    for start_bin, end_bin in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = frame[(log_x >= start_bin) & (log_x < end_bin)]
        median_log_snr.append(in_bin.log_SNR.median())
        median_log_x.append(in_bin.log_chisqr_per_dof.median())
        if cut == 'mad':
            mad_log_snr.append(median_abs_dev(in_bin.log_SNR))
            mad_log_x.append(median_abs_dev(in_bin.log_chisqr_per_dof))
    median_log_snr = np.array(median_log_snr)
    median_log_x = np.array(median_log_x)

    if cut == 'median':
        # drop bins without a defined median (empty bins)
        usable = ~(np.isnan(median_log_snr) | np.isnan(median_log_x))
        fit_x = median_log_x[usable]
        fit_y = median_log_snr[usable]
    else:
        mad_log_snr = np.array(mad_log_snr)
        mad_log_x = np.array(mad_log_x)
        # drop bins whose deviation is undefined or zero
        unusable = (np.isnan(mad_log_snr) | (mad_log_snr == 0)
                    | np.isnan(mad_log_x) | (mad_log_x == 0))
        usable = ~unusable
        fit_y = median_log_snr[usable] + Nsigma * mad_log_snr[usable]
        fit_x = median_log_x[usable] - Nsigma * mad_log_x[usable]

    # the polynomial curve is y = p[0] * x**3 + p[1] * x**2 + p[2] * x + p[3]
    # where x is log10 of chi square per degrees of freedom and y is log10 of SNR
    p = np.polyfit(fit_x, fit_y, 3)
    threshold = p[0] * log_x ** 3 + p[1] * log_x ** 2 + p[2] * log_x ** 1 + p[3]

    # loud pycbc triggers are triggers with log10 SNR above the fitted curve
    pycbc_loud = frame[frame.log_SNR > threshold]
    return pycbc_loud
def pycbc_clustering_window_around_trigger(self, pycbc_trigger, IFO, one_sided_window, extension_duration):
    '''
    description:
        This is a clustering filter
        1. take a frame of pycbc triggers and sort it by 'end_time'
        2. walk through the triggers in time order; for every trigger that is
           still present, keep only the highest-SNR trigger among the triggers
           whose end time lies within +/- one_sided_window of it
           (the first one in time order when SNRs tie)
        3. create new columns required to run this code
           (GPStime, duration, ifo, id, label, imgUrl, confidence)
    USAGE: pycbc_trigger = pycbc_clustering_window_around_trigger(pycbc_trigger, IFO, one_sided_window=0.1, extension_duration=1.5)

    :param pycbc_trigger: pycbc triggers in pandas frame format
                          (needs 'end_time', 'snr', 'template_duration' and 'template_id' columns)
    :param IFO: ifo
    :param one_sided_window: one sided window in seconds around a trigger for clustering
    :param extension_duration: factor for extending the template duration, e.g., extension_duration = 1.5 makes the duration of the on-source window 1.5 times longer than that of the trigger
    :return: pycbc_trigger: clustered pycbc triggers
    '''
    pycbc_trigger = pycbc_trigger.sort_values(by=['end_time'])
    pycbc_trigger = pycbc_trigger.reset_index(drop=True)

    for index in list(pycbc_trigger.index):
        # Triggers already removed as redundant are skipped. This explicit
        # check replaces a loop-wide `except KeyError`, which also hid
        # KeyErrors caused by missing columns.
        if index not in pycbc_trigger.index:
            continue
        # most of pycbc triggers are overlapped for a same burst, redundant ones are removed
        trigger_time = pycbc_trigger.at[index, 'end_time']
        start_window = float(trigger_time - one_sided_window)  # starting time of the window for a trigger
        end_window = float(trigger_time + one_sided_window)  # ending time of the window for a trigger
        end_times = pycbc_trigger['end_time']
        df_overlapped = pycbc_trigger[(start_window <= end_times) & (end_times <= end_window)]
        max_snr = df_overlapped['snr'].max()
        # index of the (first) trigger with the maximum SNR in the window
        event_ind = df_overlapped.index[df_overlapped['snr'] == max_snr][0]
        all_index = np.array(df_overlapped.index.tolist())
        redundant_indecies = all_index[all_index != event_ind]
        pycbc_trigger = pycbc_trigger.drop(redundant_indecies, axis=0)

    # get the meta data
    number_triggers = pycbc_trigger.shape[0]
    pycbc_trigger = pycbc_trigger.assign(GPStime=pycbc_trigger['end_time'])
    pycbc_trigger = pycbc_trigger.assign(duration=pycbc_trigger['template_duration'] * extension_duration)
    pycbc_trigger = pycbc_trigger.assign(ifo=[IFO] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(id=['TP{}'.format(i) for i in pycbc_trigger['template_id']])
    pycbc_trigger = pycbc_trigger.assign(label=['arbitrary'] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(imgUrl=[None] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(confidence=[0] * number_triggers)
    return pycbc_trigger
def pycbc_clustering_timeslice(self, pycbc_trigger, IFO, startT, endT, window, extension_duration):
    '''
    description:
        This is a clustering filter
        1. take a frame of pycbc triggers and sort it by 'end_time'
        2. slice [startT, endT) into bins that start at np.arange(startT, endT, window)
           and are `window` long; in every bin keep only the trigger with the highest SNR
           (the first one in time order when SNRs tie).
           Triggers that fall in no bin are left untouched.
        3. create new columns required to run this code
           (GPStime, duration, ifo, id, label, imgUrl, confidence)
    USAGE: pycbc_trigger = pycbc_clustering_timeslice(pycbc_trigger, IFO, startT, endT, window=0.1, extension_duration=1.5)

    :param pycbc_trigger: pycbc triggers in pandas frame
                          (needs 'end_time', 'snr', 'template_duration' and 'template_id' columns)
    :param IFO: ifo
    :param startT: start time of an epoch
    :param endT: end time of an epoch
    :param window: a window length in sec for clustering
    :param extension_duration: factor for extending the template duration, e.g., extension_duration = 1.5 makes the duration of the on-source window 1.5 times longer than that of the trigger
    :return: pycbc_trigger: clustered pycbc triggers
    '''
    pycbc_trigger = pycbc_trigger.sort_values(by=['end_time'])
    pycbc_trigger = pycbc_trigger.reset_index(drop=True)

    time_bins = np.arange(startT, endT, window)
    if len(time_bins) > 0 and pycbc_trigger.shape[0] > 0:
        end_times = pycbc_trigger['end_time'].values
        # Assign every trigger to the last bin whose start is <= its end time
        # in one pass, instead of masking the whole frame once per bin.
        bin_number = np.searchsorted(time_bins, end_times, side='right') - 1
        bin_start = time_bins[np.clip(bin_number, 0, None)]
        in_a_bin = (bin_number >= 0) & (end_times < bin_start + window)

        binned = pycbc_trigger[in_a_bin]
        if binned.shape[0] > 0:
            # idxmax returns the first (earliest) row holding the maximum SNR of each bin
            kept_index = binned.groupby(bin_number[in_a_bin])['snr'].idxmax()
            redundant_indecies = binned.index[~binned.index.isin(kept_index.values)]
            pycbc_trigger = pycbc_trigger.drop(redundant_indecies, axis=0)

    # get the meta data
    number_triggers = pycbc_trigger.shape[0]
    pycbc_trigger = pycbc_trigger.assign(GPStime=pycbc_trigger['end_time'])
    pycbc_trigger = pycbc_trigger.assign(duration=pycbc_trigger['template_duration'] * extension_duration)
    pycbc_trigger = pycbc_trigger.assign(ifo=[IFO] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(id=['TP{}'.format(i) for i in pycbc_trigger['template_id']])
    pycbc_trigger = pycbc_trigger.assign(label=['arbitrary'] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(imgUrl=[None] * number_triggers)
    pycbc_trigger = pycbc_trigger.assign(confidence=[0] * number_triggers)
    return pycbc_trigger
def GetTriggerMetadata(self, ListSegments, IFO, output_dir, number_process, trigger_pipeline='omicron', output_file='TriggerMetadata.csv', channel='GDS-CALIB_STRAIN'):
    '''
    description:
        1. find the trigger files of every segment in ListSegments
        2. make a metadata table in pandas frame
        3. save it as a .csv file in output_dir
           (the pycbc-live mode additionally saves all the triggers before any cut as 'Fulltrigger.csv')
        Note: this function works samely as CombinedIndentifyingProcess() but it is faster.
              And it is supposed to support pyCBC trigger as well.
        Any trigger_pipeline other than 'omicron' and 'pycbc-live' does nothing.
    USAGE: GetTriggerMetadata(ListSegments, IFO, output_dir, number_process, trigger_pipeline='omicron', output_file='TriggerMetadata.csv', channel='GDS-CALIB_STRAIN')

    :param ListSegments: (list of lists) [[s1, e1], [s2, e2], ...], this is because I want to exclude segments of non-observing times
    :param IFO: interferometer (L1, or H1)
    :param output_dir: an output directory
    :param number_process: a number maximum processes in parallel
    :param trigger_pipeline: trigger method 'omicron', 'pycbc-live'
    :param output_file: a name of an output file
    :param channel: (str) the name of a channel
    :return: None
    '''
    if trigger_pipeline == 'omicron':
        ListTable = []
        for ObservingSegment in ListSegments:  # iterate through a list of segments
            startT, endT = ObservingSegment[0], ObservingSegment[1]
            # get trigger files
            try:
                cache = find_trigger_files('{0}:{1}'.format(IFO, channel), trigger_pipeline, startT, endT)
            except ValueError as e:
                print("******************************")
                print("You might be using inappropriate cluster. Please use a LLO/LHO cluster for L1/H1 omicron trigger")
                print("Terminate this program.")
                print("The original error message was")
                print("==============================")
                # Python 3 exceptions have no `.message` attribute; printing the
                # exception itself shows its message
                print(e)
                sys.exit()
            # convert data to a table
            try:
                t = EventTable.read(cache, format='ligolw', tablename='sngl_burst')
                df = t.to_pandas()  # convert it to a pandas table
                ListTable.append(df)
            except ValueError:
                # convert each trigger file to a table with multi processing
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for df_indiv in executor.map(Multiprocess_ConvertToTable, cache, itrtls.repeat(trigger_pipeline), itrtls.repeat(IFO), itrtls.repeat(None)):
                        ListTable.append(df_indiv)
            except IndexError:
                # IndexError: cannot read EventTable from empty source list
                pass
        FullTriggerSet = pd.concat(ListTable)  # concatenate all the tables
        number_triggers = FullTriggerSet.shape[0]
        df_new = pd.DataFrame()
        # get the meta data
        df_new = df_new.assign(GPStime=FullTriggerSet['peak_time'] + FullTriggerSet['peak_time_ns'] * 10 ** (-9))
        df_new = df_new.assign(startGPStime=FullTriggerSet['start_time'] + FullTriggerSet['start_time_ns'] * 10 ** (-9))
        df_new = df_new.assign(peakFreq=FullTriggerSet['peak_frequency'])
        df_new = df_new.assign(snr=FullTriggerSet['snr'])
        df_new = df_new.assign(amplitude=FullTriggerSet['amplitude'])
        df_new = df_new.assign(centralFreq=FullTriggerSet['central_freq'])
        df_new = df_new.assign(duration=FullTriggerSet['duration'])
        df_new = df_new.assign(bandwidth=FullTriggerSet['bandwidth'])
        df_new = df_new.assign(chisq=FullTriggerSet['chisq'])
        df_new = df_new.assign(chisqDof=FullTriggerSet['chisq_dof'])
        df_new = df_new.assign(confidence=FullTriggerSet['confidence'])
        df_new = df_new.assign(id=[None] * number_triggers)
        df_new = df_new.assign(ifo=FullTriggerSet['ifo'])
        df_new = df_new.assign(label=['arbitrary'] * number_triggers)
        df_new = df_new.assign(imgUrl=[None] * number_triggers)
        print('creating meta data matrix')
        output_path = os.path.join(output_dir, output_file)
        # save it as .csv file
        df_new.to_csv(output_path)
        print('Done!')
    elif trigger_pipeline == 'pycbc-live':
        ListTable = []
        # metadata list
        Columns = ['chisq', 'chisq_dof', 'coa_phase', 'end_time', 'f_lower', 'mass1', 'mass2', 'sigmasq', 'snr', 'template_duration', 'template_hash', 'template_id']
        for ObservingSegment in ListSegments:  # iterate through a list of segments
            startT, endT = ObservingSegment[0], ObservingSegment[1]
            # get trigger files
            cache = find_trigger_files('{0}:{1}'.format(IFO, channel), trigger_pipeline, startT, endT)
            # trim "file://"
            for index, path in enumerate(cache):
                cache[index] = path.split('file://')[1]
            cache = filter_pycbc_live_files(cache)  # filter out empty files
            # convert each trigger files to a table with multi processing
            with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                for df_indiv in executor.map(Multiprocess_ConvertToTable, cache, itrtls.repeat(trigger_pipeline), itrtls.repeat(IFO), itrtls.repeat(Columns)):
                    ListTable.append(df_indiv)  # fill a pandas table to a list
        FullTriggerSet = pd.concat(ListTable)  # concatenate all the tables
        # save all the triggers
        output_path_full = os.path.join(output_dir, 'Fulltrigger.csv')
        FullTriggerSet.to_csv(output_path_full)
        print('# of triggers {}'.format(FullTriggerSet.shape[0]))
        del ListTable
        ########################
        # special treatment (pick up samples for analysis)
        # take triggers with SNR > 7.5 and template duration < 4 to avoid Gaussian noise and long varying noise
        df_new = self.pyCBC_template_duration_filter(FullTriggerSet, low_cutoff_duration=0, high_cutoff_duration=4)
        df_new = self.pyCBC_SNR_filter(df_new, SNR_low_cut=7.5, SNR_high_cut=150)
        # clustering
        # Cluster every segment in turn. Previously only the start/end of the
        # last segment (left over from the loop above) were used, so triggers
        # of all earlier segments were never clustered. The clustering leaves
        # triggers outside the given segment untouched, so the passes do not
        # interfere with each other.
        for ObservingSegment in ListSegments:
            df_new = self.pycbc_clustering_timeslice(df_new, IFO, ObservingSegment[0], ObservingSegment[1], window=0.1, extension_duration=1.)
        print('creating meta data matrix')
        output_path = os.path.join(output_dir, output_file)
        # save it as .csv file
        df_new.to_csv(output_path)
        print('Done!')
    return None
#return df_loud, df_quiet
def FindGlitchNearestGPS(self, PathMetadataFile, candidate_GPS):
    '''
    description:
        1. take a trigger meta data file (.csv)
        2. among the triggers within +/- 0.5 sec of candidate_GPS, find the one with the largest SNR
        3. replace the label of this glitch with 'candidate' and overwrite the file
        This function is assumed to be used in DQR. Once GraceDB provides a GPS time,
        this function labels the loudest trigger around it so that only this glitch can be
        studied by specifying glitch_type = candidate in a configuration file
    USAGE: largetSNR = FindGlitchNearestGPS(PathMetadataFile, candidate_GPS)

    :param PathMetadataFile: a path to the meta data file (needs 'GPStime', 'snr' and 'label' columns)
    :param candidate_GPS: a GPS time to be concerned
    :return: largetSNR: the SNR of the trigger that has been labelled 'candidate'
    :raises IndexError: if no trigger with a valid SNR lies within the window
    '''
    df = pd.read_csv(PathMetadataFile)
    start = candidate_GPS - 0.5  # start time of the target window, which is 0.5 sec before a candidate event
    end = candidate_GPS + 0.5  # end time of the target window, which is 0.5 sec after a candidate event
    # triggers within the time window (both edges exclusive)
    df_window = df[(df['GPStime'] > start) & (df['GPStime'] < end)]
    largetSNR = df_window['snr'].max()
    loudest = df_window.index[df_window['snr'] == largetSNR]
    if len(loudest) == 0:
        # same exception type as before, but with an explicit reason
        raise IndexError('no trigger with a valid SNR within 0.5 sec of GPS time {} in {}'.format(candidate_GPS, PathMetadataFile))
    largestglitchIndex = loudest[0]
    # replace the label of this glitch with 'candidate'
    df.loc[largestglitchIndex, 'label'] = 'candidate'
    # index=False: the frame was read without an index column, so writing the
    # index back would add one more unnamed column to the file on every call
    df.to_csv(PathMetadataFile, index=False)
    return largetSNR
#+++++++++++++++++++++++++++++++++++++ # find the allowed segments #++++++++++++++++++++++++++++++++++++++
def Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. keep the glitches of the meta data whose GPS time lies in one of the epochs
        2. replace the durations with 90% of them (rounded to 1 ms) and reserve
           [GPStime - position_duration_bfr_centr * duration - gap,
            GPStime + (1 - position_duration_bfr_centr) * duration + gap] around every glitch
        3. drop glitches with SNR below BGSNR_thre, as they are quiet enough to be considered as background
        4. get the subset of the target glitches based on class, IFO, confidence, SNR,
           duration and (when the column 'peakFreq' exists) peak frequency
        5. accept target glitches whose background segments do not coincide with the
           reserved segment of any other glitch
        6. return the info of the accepted glitches
    USAGE: Listsegments = Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr)

    :param df: meta data in pandas format
               (needs 'GPStime', 'duration', 'snr', 'confidence', 'label', 'ifo' and 'id' columns; 'peakFreq' is optional)
    :param Epoch_lt: a list of epochs [[start1, end1], [start2, end2], ...]
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1) (str)
    :param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough, float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int)
    :param UserDefinedDuration: user defined duration of a target glitch (float or int); ignored unless > 0
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers for target glitches
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers for target glitches
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers for target glitches
    :param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
    :return: Listsegments: a list with one tuple per accepted glitch, made of
        index of the glitch among the target glitches,
        target glitch starting time, target glitch ending time,
        preceding background starting time, preceding background ending time,
        following background starting time, following background ending time,
        GPS time, duration, SNR, confidence level, ID
        With flag = 'Either' the times of an unavailable background are filled with 0.
    :raises ValueError: if flag is neither 'Both' nor 'Either'
    '''
    if flag not in ('Both', 'Either'):
        # previously an unknown flag silently produced an empty list
        raise ValueError("flag must be 'Both' or 'Either', got {!r}".format(flag))

    # take the data in the epochs
    df_list = []
    for Epoch in Epoch_lt:
        StartT, EndT = Epoch[0], Epoch[1]
        df_list.append(df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)])
    df_int = pd.concat(df_list)

    #### improvement for duration 2019-04-05
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    FractionEnergy = 0.9  # (90% of energy)
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)  # replace full durations (T100) with T90

    if UserDefinedDuration > 0:
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), ['duration']] = UserDefinedDuration

    # the time segments reserved by the glitches: duration +/- gap around a GPS time of a glitch
    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])

    # exclude glitches with SNR < BGSNR_thre as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    # data set containing only a target glitch class
    is_target = ((df_int['label'] == TargetGlitchClass)
                 & (df_int['ifo'] == IFO)
                 & (df_int['confidence'] >= Confidence_thre)
                 & (df_int['snr'] >= targetSNR_thre)
                 & (df_int['snr'] <= targetUpperSNR_thre)
                 & (df_int['duration'] >= LowerDurationThresh)
                 & (df_int['duration'] <= UpperDurationThresh))
    if 'peakFreq' in df_int.columns:
        # gravity spy and omicron trigger mode; pyCBC triggers have no column "peakFreq"
        is_target = is_target & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff)
    is_target = is_target.values
    df_glitch = df_int[is_target]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    reserved_start = df_int['reserved_time_start'].values
    reserved_end = df_int['reserved_time_end'].values
    target_positions = np.flatnonzero(is_target)  # rows of df_int holding the target glitches

    def background_is_available(background_start, background_end, own_position):
        # A background is available when no reserved segment of another glitch
        # overlaps it (segments that only touch do not overlap; assumes every
        # reserved segment has start <= end). The target's own reserved segment
        # is left out: it only touches its backgrounds mathematically, but
        # floating-point rounding can make it appear to overlap them.
        overlapping = (reserved_start < background_end) & (reserved_end > background_start)
        overlapping[own_position] = False
        return not overlapping.any()

    gps_values = df_glitch['GPStime'].values
    duration_values = df_glitch['duration'].values
    snr_values = df_glitch['snr'].values
    confidence_values = df_glitch['confidence'].values
    id_values = df_glitch['id'].values
    gap_values = df_glitch['gap'].values

    Listsegments = []
    for ind, own_position in enumerate(target_positions):
        gpstime = gps_values[ind]  # a peak GPS time
        duration = duration_values[ind]  # a duration
        gap_individual = gap_values[ind]

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # the backgrounds are the nearest neighbor segments before and after the
        # target, separated from it by the gap and as long as the target
        background_duration = duration
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        pre_available = background_is_available(pre_background_start, pre_background_end, own_position)
        fol_available = background_is_available(fol_background_start, fol_background_end, own_position)

        if flag == 'Both':
            # take both preceding and following backgrounds
            if not (pre_available and fol_available):
                continue
        else:
            # accept if either a preceding or a following background is available
            if not (pre_available or fol_available):
                print('Both a preceding and following are NOT available')
                continue
            if not pre_available:
                pre_background_start, pre_background_end = 0, 0  # fill zero
            if not fol_available:
                fol_background_start, fol_background_end = 0, 0  # fill zero

        Listsegments.append((ind, target_timeseries_start, target_timeseries_end,
                             pre_background_start, pre_background_end,
                             fol_background_start, fol_background_end,
                             gpstime, duration, snr_values[ind], confidence_values[ind], id_values[ind]))
    return Listsegments
#=======================================
def _longest_bg_is_clean(reserved_starts, reserved_ends, bg_start, bg_end):
    '''
    Tell whether the window (bg_start, bg_end) is free of reserved segments.

    :param reserved_starts: array of reserved-segment start times
    :param reserved_ends: array of reserved-segment end times (same order)
    :param bg_start: start time of the candidate background window
    :param bg_end: end time of the candidate background window
    :return: True if none of the four collision cases below occurs
    '''
    # case 1: a reserved segment ends strictly inside the window
    case1 = (reserved_ends > bg_start) & (reserved_ends < bg_end)
    # case 2: a reserved segment starts strictly inside the window
    case2 = (reserved_starts > bg_start) & (reserved_starts < bg_end)
    # case 3: a reserved segment lies strictly inside the window
    case3 = (reserved_starts > bg_start) & (reserved_ends < bg_end)
    # case 4: a reserved segment strictly contains the window
    case4 = (reserved_starts < bg_start) & (reserved_ends > bg_end)
    return not (case1 | case2 | case3 | case4).any()


def _longest_bg_extended_length(nearest_gap, duration):
    '''
    Return the extended background length, or None to keep the default one.

    The background is only stretched when the free gap to the nearest other
    reserved segment is at least one glitch duration and the glitch itself is
    shorter than 10 sec; the stretched length is capped at 10 * duration.

    :param nearest_gap: free time between the default background window and
        the nearest reserved segment on that side
    :param duration: duration of the target glitch
    :return: new background length, or None when no extension applies
    '''
    if nearest_gap >= duration and duration < 10:
        return min(nearest_gap, 10 * duration)
    return None


def FindglitchlistLongestBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre,
                            UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap,
                            position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0,
                            TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. keep the triggers whose 'GPStime' falls inside any epoch of Epoch_lt
        2. reserve [GPStime - position * duration - gap, GPStime + (1 - position) * duration + gap]
           around every trigger with snr >= BGSNR_thre
        3. select the target glitches (class, IFO, confidence, SNR, frequency and duration cuts)
        4. accept a target glitch when its preceding / following background window does not
           collide with any reserved segment
        5. stretch every accepted background window towards the nearest other reserved segment
           (at most 10 times the glitch duration, only for glitches shorter than 10 sec)
        6. return the info of the accepted glitches

    USAGE: Listsegments = FindglitchlistLongestBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, flag=flag)

    :param df: trigger meta data in pandas format; the columns read here are 'GPStime', 'duration', 'snr',
        'confidence', 'id', 'ifo', 'label' and 'peakFreq' ('centralFreq' is used when 'peakFreq' is absent)
    :param Epoch_lt: a list of epochs; element [0] / [1] of each epoch is its start / end GPS time (inclusive)
    :param TargetGlitchClass: a target glitch class name (str), compared with the 'label' column
    :param IFO: a type of interferometer (H1, L1, V1) (str), compared with the 'ifo' column
    :param BGSNR_thre: triggers with snr below this value are dropped, i.e., treated as quiet background
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a lower threshold of confidence level (float or int)
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: if > 0, overrides the duration of every trigger of the target class and IFO
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment before the center time,
        e.g, 0.5 indicates duration is evenly distributed around the center time
    :param TriggerPeakFreqLowerCutoff: a lower cutoff of the peak (or central) frequency for target glitches
    :param TriggerPeakFreqUpperCutoff: an upper cutoff of the peak (or central) frequency for target glitches
    :param targetUpperSNR_thre: an upper threshold of SNR for target glitches
    :param flag: 'Both' requires a clean preceding and a clean following background; 'Either' accepts a
        glitch with at least one clean background and fills the unavailable one with 0; with any other
        value nothing is accepted and an empty list is returned
    :return: Listsegments, a list of tuples
        (index, target_start, target_end, pre_background_start, pre_background_end,
         fol_background_start, fol_background_end, gpstime, duration, SNR, confidence, ID)
        where index is the position of the glitch within the selected target glitches
    :raises SystemExit: if an accepted background window ends up shorter than the glitch duration
    :raises ValueError: raised by pd.concat when Epoch_lt is empty
    :raises KeyError: if a required column is missing from df
    '''
    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])

    # FractionEnergy = 1. keeps the full duration, so only the rounding to 3 decimals takes effect
    FractionEnergy = 1.
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)

    if UserDefinedDuration > 0:
        # redefine only the duration of the target glitch class
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), ['duration']] = UserDefinedDuration

    # reserve duration +/- gap around the GPS time of every trigger
    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])

    # triggers quieter than BGSNR_thre are considered as background and do not reserve time
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    # Gravity Spy / omicron tables carry 'peakFreq'; otherwise fall back to 'centralFreq'
    freq_column = 'peakFreq' if 'peakFreq' in df_int.columns else 'centralFreq'
    df_glitch = df_int[(df_int['label'] == TargetGlitchClass)
                       & (df_int['ifo'] == IFO)
                       & (df_int['confidence'] >= Confidence_thre)
                       & (df_int['snr'] >= targetSNR_thre)
                       & (df_int['snr'] <= targetUpperSNR_thre)
                       & (df_int[freq_column] >= TriggerPeakFreqLowerCutoff)
                       & (df_int[freq_column] <= TriggerPeakFreqUpperCutoff)
                       & (df_int['duration'] >= LowerDurationThresh)
                       & (df_int['duration'] <= UpperDurationThresh)]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # reserved segments of all loud triggers, hoisted out of the loop
    reserved_starts = df_int['reserved_time_start'].values
    reserved_ends = df_int['reserved_time_end'].values

    Listsegments = []
    for ind in range(len(df_glitch)):
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if not duration > 0.02:  # only glitches longer than 0.02 sec are accepted
            continue

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # default backgrounds: one glitch duration long, separated from the target by the gap
        background_duration = duration
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        pre_available = _longest_bg_is_clean(reserved_starts, reserved_ends, pre_background_start, pre_background_end)
        fol_available = _longest_bg_is_clean(reserved_starts, reserved_ends, fol_background_start, fol_background_end)

        if flag == 'Both':
            if not (pre_available and fol_available):
                continue
        elif flag == 'Either':
            if not (pre_available or fol_available):
                print('Both a preceding and following are NOT available')
                continue
        else:
            continue  # an unrecognised flag accepts nothing

        if pre_available:
            # stretch backwards towards the nearest reserved segment ending before the background
            earlier_ends = reserved_ends[reserved_ends < pre_background_start]
            if earlier_ends.size:
                new_length = _longest_bg_extended_length(pre_background_start - earlier_ends.max(), duration)
                if new_length is not None:
                    pre_background_start = pre_background_end - new_length
            pre_length = pre_background_end - pre_background_start
        else:
            pre_length = 0

        if fol_available:
            # stretch forwards towards the nearest reserved segment starting after the background
            later_starts = reserved_starts[reserved_starts > fol_background_end]
            if later_starts.size:
                new_length = _longest_bg_extended_length(later_starts.min() - fol_background_end, duration)
                if new_length is not None:
                    fol_background_end = fol_background_start + new_length
            fol_length = fol_background_end - fol_background_start
        else:
            fol_length = 0

        print('{:.1f}, {:.1f}, {:.1f}'.format(pre_length, fol_length, duration))

        # sanity check: an available background must never be shorter than the glitch itself
        pre_too_short = pre_available and round(pre_length, 3) < round(duration, 3)
        fol_too_short = fol_available and round(fol_length, 3) < round(duration, 3)
        if pre_too_short or fol_too_short:
            # exit with a message (non-zero status) instead of a silent exit
            sys.exit('FindglitchlistLongestBG: a background window is shorter than the glitch duration '
                     '(GPStime = {}, duration = {})'.format(gpstime, duration))

        Listsegments.append((ind,
                             target_timeseries_start,
                             target_timeseries_end,
                             pre_background_start if pre_available else 0,  # fill zero when unavailable
                             pre_background_end if pre_available else 0,
                             fol_background_start if fol_available else 0,
                             fol_background_end if fol_available else 0,
                             gpstime, duration, SNR, confi, ID))

    return Listsegments
def Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre,
                                           Confidence_thre, UpperDurationThresh, LowerDurationThresh,
                                           UserDefinedDuration, gap, position_duration_bfr_centr,
                                           TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192,
                                           targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. keep the triggers whose 'GPStime' falls inside any epoch of Epoch_lt
        2. drop the triggers with snr below BGSNR_thre
        3. select the target glitches (class, IFO, confidence, SNR and duration cuts, plus the
           peak-frequency cuts when the table has a 'peakFreq' column)
        4. attach to every target glitch a preceding and a following background window of
           15 times the glitch duration, capped at 15 sec
        5. return the info of the glitches; no coincidence check against other glitches is done here

    USAGE: Listsegments = Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both')

    :param df: trigger meta data in pandas format; the columns read here are 'GPStime', 'duration', 'snr',
        'confidence', 'id', 'ifo', 'label' and, when present, 'peakFreq'
    :param Epoch_lt: a list of epochs; element [0] / [1] of each epoch is its start / end GPS time (inclusive)
    :param TargetGlitchClass: a target glitch class name (str), compared with the 'label' column
    :param IFO: a type of interferometer (H1, L1, V1) (str), compared with the 'ifo' column
    :param BGSNR_thre: triggers with snr below this value are dropped before the target selection
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a lower threshold of confidence level (float or int)
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: if > 0, overrides the duration of every trigger of the target class and IFO
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment before the center time,
        e.g, 0.5 indicates duration is evenly distributed around the center time
    :param TriggerPeakFreqLowerCutoff: a lower cutoff of 'peakFreq' for target glitches (ignored without that column)
    :param TriggerPeakFreqUpperCutoff: an upper cutoff of 'peakFreq' for target glitches (ignored without that column)
    :param targetUpperSNR_thre: an upper threshold of SNR for target glitches
    :param flag: not used by this function; kept so the signature matches the other Findglitchlist functions
    :return: Listsegments, a list of tuples
        (index, target_start, target_end, pre_background_start, pre_background_end,
         fol_background_start, fol_background_end, gpstime, duration, SNR, confidence, ID)
        where index is the position of the glitch within the selected target glitches
    :raises ValueError: raised by pd.concat when Epoch_lt is empty
    :raises KeyError: if a required column is missing from df
    '''
    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])

    # FractionEnergy = 1. keeps the full duration, so only the rounding to 3 decimals takes effect
    FractionEnergy = 1.
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)

    if UserDefinedDuration > 0:
        # redefine only the duration of the target glitch class
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), ['duration']] = UserDefinedDuration

    # triggers quieter than BGSNR_thre are dropped
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    is_target = ((df_int['label'] == TargetGlitchClass)
                 & (df_int['ifo'] == IFO)
                 & (df_int['confidence'] >= Confidence_thre)
                 & (df_int['snr'] >= targetSNR_thre)
                 & (df_int['snr'] <= targetUpperSNR_thre))
    if 'peakFreq' in df_int.columns:
        # Gravity Spy / omicron tables; tables without 'peakFreq' get no frequency cut
        is_target = (is_target
                     & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff)
                     & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff))
    is_target = (is_target
                 & (df_int['duration'] >= LowerDurationThresh)
                 & (df_int['duration'] <= UpperDurationThresh))
    df_glitch = df_int[is_target]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    Listsegments = []
    for ind in range(len(df_glitch)):
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]

        if not duration > 0.02:  # only glitches longer than 0.02 sec are accepted
            continue

        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration

        # backgrounds are 15 times the glitch duration, capped at 15 to reduce computational cost
        background_duration = 15 * duration
        if background_duration > 15:
            background_duration = 15

        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start,
                                              fol_background_end - fol_background_start,
                                              duration))

        Listsegments.append((ind, target_timeseries_start, target_timeseries_end,
                             pre_background_start, pre_background_end,
                             fol_background_start, fol_background_end,
                             gpstime, duration, SNR, confi, ID))

    return Listsegments
def FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. within an epoch, draw synthetic center times at random (without duplication) from a grid of
           spacing "step" laid over the segments returned by IdentifyGlitch.FindObservingTimeSegments()
        2. draw a duration for every synthetic point from the histogram of the durations in df_target
        3. store the points in a pandas data frame with the standard trigger columns, write it to
           <trigger_dir>/RandomSample.csv and return the frame as it is read back from that file
    USAGE: df = FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs, only the start of the first and the end of the last epoch are used
    :param number_samples: number of samples requested, it is reduced when the segments can not hold that many grid points
    :param step: step of data points in sec
    :param outputMother_dir: an output directory passed on to FindObservingTimeSegments()
    :param df_target: true glitch samples, only the column "duration" is read
    :return: df: synthetic random data points (label 'null', snr 0) with durations drawn from the distribution of df_target
    :raises ValueError: when df_target has no duration greater than 0.02 s
    '''
    # get a glitch identifier object
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get the segments of the requested state
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the state used to be compared against the misspelled 'nominal-lcok' only, so that the trimming
    # below was never applied; the misspelling is still accepted for old callers
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:, 0] = SegmentsMat[:, 0] + 300  # remove 5 min after the observing mode begins
        SegmentsMat[:, 1] = SegmentsMat[:, 1] - 60  # remove 1 min before the observing mode ends
        # segments shorter than the trimmed 6 min have no usable time left
        # (assumes SegmentsMat is a 2-D numpy array, it is already sliced like one above)
        SegmentsMat = SegmentsMat[SegmentsMat[:, 1] > SegmentsMat[:, 0]]

    total_seg_used = (SegmentsMat[:, 1] - SegmentsMat[:, 0]).sum()
    max_samples = int(total_seg_used / float(step))  # the maximum number of possible samples
    if number_samples > max_samples:
        number_samples = max_samples
        print('User selected number of synthetic samples is greater than the maximum number of possible samples')
        print('Set the number of samples equal to the maximum number = {}'.format(number_samples))

    # -----------------------------------------------------+
    # generate synthetic list of center times of samples   |
    # -----------------------------------------------------+
    center_time_chunks = []
    for start_seg_used, end_seg_used in SegmentsMat:
        length_seg_used = end_seg_used - start_seg_used  # length of a segment in sec
        number_slots = int(length_seg_used / float(step))  # number of grid points in this segment
        if number_slots <= 0:  # a segment shorter than a step can not host any sample
            continue
        # share of the request that falls in this segment
        number_picked = int(number_samples * length_seg_used / float(total_seg_used))
        # pick one more than the share as the integer truncation makes the total a little smaller than
        # the request, but never more than the grid points that exist (sampling is without duplication)
        number_drawn = min(number_picked + 1, number_slots)
        selected_index_center_times = np.random.choice(number_slots, number_drawn, replace=False)
        center_time_chunks.append(np.sort(start_seg_used + step * selected_index_center_times))

    if center_time_chunks:
        whole_list_center_times = np.concatenate(center_time_chunks).astype(float)
    else:
        whole_list_center_times = np.array([])

    if number_samples > whole_list_center_times.size:
        # can happen when many short segments each contribute fewer points than their share
        number_samples = whole_list_center_times.size
        print('Set the number of samples equal to the number of available grid points = {}'.format(number_samples))

    # randomly choose the timestamps where the size of the sample equal to the request, in ascending order
    if number_samples > 0:
        whole_list_center_times = np.sort(np.random.choice(whole_list_center_times, number_samples, replace=False))
    else:
        whole_list_center_times = np.array([])

    # --------------------------------------+
    # generate synthetic list of durations  |
    # --------------------------------------+
    duration_min = 0.02  # glitches with durations of 0.02 s or less are not analysed in this program
    duration_bin_width = 0.01  # the requested bin width of the duration histogram
    duration_max = df_target.duration.max()
    durations = df_target[(df_target.duration > duration_min) & (df_target.duration <= duration_max)].duration
    if len(durations) == 0:
        raise ValueError('no target glitch has a duration greater than {} s, the duration distribution can not be built'.format(duration_min))
    # at least one bin, otherwise np.histogram fails when the longest duration is below 0.03 s
    Bins = max(int((duration_max - duration_min) / duration_bin_width), 1)
    Count, Edge = np.histogram(durations, bins=Bins, density=True)
    d_Edge = Edge[1] - Edge[0]  # bin width
    Center_bin = Edge[:-1] + d_Edge * 0.5  # center values of each bin
    # probability of each bin, normalised explicitly so that it sums to one within floating point accuracy
    Prob = Count * d_Edge
    Prob = Prob / Prob.sum()
    gen_d = np.random.choice(Center_bin, number_samples, p=Prob)

    # make a pandas dataframe that is compatible with the standard trigger format of this program
    df = pd.DataFrame({
        'GPStime': whole_list_center_times,
        'startGPStime': np.zeros(number_samples),
        'peakFreq': np.zeros(number_samples),
        'snr': np.zeros(number_samples),
        'amplitude': np.zeros(number_samples),
        'centralFreq': np.zeros(number_samples),
        'duration': gen_d,
        'bandwidth': np.zeros(number_samples),
        'chisq': np.zeros(number_samples),
        'chisqDof': np.zeros(number_samples),
        'confidence': np.zeros(number_samples),
        'id': [None] * number_samples,
        'ifo': ['{}'.format(IFO)] * number_samples,
        'label': ['null'] * number_samples,
        'imgUrl': [None] * number_samples,
    })
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path)  # save the data sample
    df = pd.read_csv(output_path)  # load the file
    df.loc[:, 'label'] = 'null'  # read_csv turns the string 'null' into NaN, restore it
    return df
def FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. shift the trigger times of the target glitches by +10 s and relabel them as synthetic points
           (snr 0, label 'null', no id, no image url)
        2. keep the shifted points that fall inside the segments returned by
           IdentifyGlitch.FindObservingTimeSegments()
        3. write them to <trigger_dir>/RandomSample.csv and return the frame as it is read back from that file
    USAGE: df = FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs, only the start of the first and the end of the last epoch are used
    :param number_samples: not used, kept so that the signature matches FindRadomlistPoints
    :param step: not used, kept so that the signature matches FindRadomlistPoints
    :param outputMother_dir: an output directory passed on to FindObservingTimeSegments()
    :param df_target: true glitch samples generated by an ETG
    :return: df: the time-shifted copies of df_target that lie inside the segments
    '''
    # get a glitch identifier object
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get the segments of the requested state
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the state used to be compared against the misspelled 'nominal-lcok' only, so that the trimming
    # below was never applied; the misspelling is still accepted for old callers
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:, 0] = SegmentsMat[:, 0] + 300  # remove 5 min after the observing mode begins
        SegmentsMat[:, 1] = SegmentsMat[:, 1] - 60  # remove 1 min before the observing mode ends

    number_rows = df_target.shape[0]
    df_target = df_target.assign(GPStime=df_target.GPStime.values + 10)  # intentionally shift the trigger times by 10 s
    df_target = df_target.assign(snr=[0] * number_rows)
    df_target = df_target.assign(label=['null'] * number_rows)
    df_target = df_target.assign(id=[None] * number_rows)
    df_target = df_target.assign(imgUrl=[None] * number_rows)

    # shifted points inside each segment, in segment order
    df_list = [df_target[(df_target.GPStime >= Seg_start) & (df_target.GPStime <= Seg_end)]
               for Seg_start, Seg_end in SegmentsMat]
    if df_list:
        df = pd.concat(df_list)
    else:  # pd.concat refuses an empty list, return an empty frame with the same columns instead
        df = df_target.iloc[0:0]
    df = df.reset_index(drop=True)

    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path)  # save the data sample
    df = pd.read_csv(output_path)  # load the file
    df.loc[:, 'label'] = 'null'  # read_csv turns the string 'null' into NaN, restore it as FindRadomlistPoints does
    return df
def FindBGlist(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. take the triggers of df that lie inside the epochs
        2. get the subset of the target glitches based on the class, IFO, confidence, SNR, frequency and duration cuts
        3. generate synthetic (label 'null') points with FindRadomlistPoints() whose durations follow those of the
           target glitches louder than BGSNR_thre
        4. accept synthetic points whose background segments do not coincide with the reserved time
           (duration +/- gap) of any trigger louder than BGSNR_thre
        5. return the info of the accepted synthetic points
    USAGE: Listsegments = FindBGlist(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, flag=flag)

    :param state: IFO state, passed on to FindRadomlistPoints()
    :param number_trials: number of synthetic points requested, passed on to FindRadomlistPoints()
    :param step: step of the synthetic data points in sec, passed on to FindRadomlistPoints()
    :param outputMother_dir: an output directory, passed on to FindRadomlistPoints()
    :param df: trigger meta data in pandas format
    :param Epoch_lt: a list of epochs, each is (start time, end time)
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1) (str)
    :param BGSNR_thre: an upper threshold of SNR for background glitches, i.e., quiet enough, float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int)
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration (float or int), when positive it replaces the duration of every trigger of the given IFO
    :param gap: a time gap between the target and the background segments in sec
    :param TriggerPeakFreqLowerCutoff: a lower cutoff of the peak (or central) frequency of target glitches
    :param TriggerPeakFreqUpperCutoff: an upper cutoff of the peak (or central) frequency of target glitches
    :param targetUpperSNR_thre: an upper cutoff of SNR of target glitches
    :param flag: 'Both' requires both backgrounds to be clean, 'Either' accepts a point when the preceding
                 or the following background is clean (the unavailable one is filled with zeros),
                 any other value accepts nothing
    :return: Listsegments, a list of tuples
             (index, target start, target end, preceding background start, preceding background end,
              following background start, following background end, GPS time, duration, SNR, confidence, ID)
    '''
    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])

    # data set containing only a target glitch class
    common_cut = ((df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO)
                  & (df_int['confidence'] >= Confidence_thre)
                  & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre)
                  & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh))
    try:  # gravity spy and omicron trigger mode
        trigger_freq = df_int['peakFreq']
    except KeyError:  # pyCBC mode as there is no column "peakFreq"
        trigger_freq = df_int['centralFreq']
    df_target = df_int[common_cut & (trigger_freq >= TriggerPeakFreqLowerCutoff) & (trigger_freq <= TriggerPeakFreqUpperCutoff)]

    # generate synthetic data points and add them to the data set
    df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])
    # DataFrame.append no longer exists in pandas 2, pd.concat is the equivalent call
    df_int = pd.concat([df_int, df_random], ignore_index=True)

    # fraction of the full duration (T100) that is kept, 1 keeps 100% of the energy, which is the conservative
    # choice for finding quiet segments; (1 - FractionEnergy)/2. is the one-sided part that is cut off and
    # is added to the gap
    FractionEnergy = 1
    df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)

    if UserDefinedDuration > 0:
        # assign the user defined duration to any type of glitch observed in the given IFO
        df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration

    # reserve duration/2 + gap on both sides of the GPS time of every trigger
    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])

    # data set of randomly chosen background, synthetic triggers carry the label 'null'
    df_glitch = df_int[df_int.label == 'null']
    # triggers with SNR <= BGSNR_thre are quiet enough to be considered as background, only the louder ones reserve time
    df_int = df_int[df_int['snr'] > BGSNR_thre]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # the reserved segments do not change inside the loop, take them out once
    reserved_start = df_int['reserved_time_start'].values
    reserved_end = df_int['reserved_time_end'].values

    Listsegments = []
    sample_columns = zip(df_glitch['GPStime'].values, df_glitch['duration'].values, df_glitch['snr'].values,
                         df_glitch['confidence'].values, df_glitch['id'].values, df_glitch['gap'].values)
    for ind, (gpstime, duration, SNR, confi, ID, gap_individual) in enumerate(sample_columns):
        if not duration > 0.02:  # accept only durations greater than 0.02 sec
            continue

        # the target is assumed to be distributed evenly around the GPS time
        target_timeseries_start = gpstime - duration / 2.
        target_timeseries_end = target_timeseries_start + duration

        # the backgrounds are the nearest neighbor segments before and after the target, separated by the gap
        background_duration = duration
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        pre_free = _background_is_free(reserved_start, reserved_end, pre_background_start, pre_background_end)
        fol_free = _background_is_free(reserved_start, reserved_end, fol_background_start, fol_background_end)

        if flag == 'Both':  # take both preceding and following backgrounds
            if not (pre_free and fol_free):
                continue
            pre_background_start = _stretched_preceding_start(reserved_end, pre_background_start, pre_background_end, duration)
            fol_background_end = _stretched_following_end(reserved_start, fol_background_start, fol_background_end, duration)
            print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
            # sanity check, a background must never be shorter than the target
            if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                sys.exit('FindBGlist: a background segment is shorter than the target duration')
        elif flag == 'Either':  # accept if either a preceding or a following background is available
            if not (pre_free or fol_free):
                print('Both a preceding and following are NOT available')
                continue
            if pre_free:
                pre_background_start = _stretched_preceding_start(reserved_end, pre_background_start, pre_background_end, duration)
            else:
                pre_background_start, pre_background_end = 0, 0  # fill zero
            if fol_free:
                fol_background_end = _stretched_following_end(reserved_start, fol_background_start, fol_background_end, duration)
            else:
                fol_background_start, fol_background_end = 0, 0  # fill zero
        else:  # any other flag accepts nothing
            continue

        Listsegments.append((ind, target_timeseries_start, target_timeseries_end,
                             pre_background_start, pre_background_end,
                             fol_background_start, fol_background_end,
                             gpstime, duration, SNR, confi, ID))
    return Listsegments


def _background_is_free(reserved_start, reserved_end, window_start, window_end):
    '''
    description: tell whether no reserved segment coincides with a background window
    :param reserved_start: numpy array of the start times of the reserved segments
    :param reserved_end: numpy array of the end times of the reserved segments
    :param window_start: start time of the background window
    :param window_end: end time of the background window
    :return: True if none of the four cases below occurs for any reserved segment
    '''
    # case 1: the end of a reserved segment lies inside the window
    case1 = (reserved_end > window_start) & (reserved_end < window_end)
    # case 2: the start of a reserved segment lies inside the window
    case2 = (reserved_start > window_start) & (reserved_start < window_end)
    # case 3: a reserved segment lies completely inside the window
    case3 = (reserved_start > window_start) & (reserved_end < window_end)
    # case 4: the window lies completely inside a reserved segment
    case4 = (reserved_start < window_start) & (reserved_end > window_end)
    return not (case1 | case2 | case3 | case4).any()


def _stretched_preceding_start(reserved_end, window_start, window_end, duration):
    '''
    description: lengthen a preceding background to the gap to the nearest earlier reserved segment
    :param reserved_end: numpy array of the end times of the reserved segments
    :param window_start: start time of the preceding background
    :param window_end: end time of the preceding background
    :param duration: duration of the target
    :return: the new start time, or window_start when the window is not changed
    '''
    earlier_ends = reserved_end[reserved_end < window_start]
    if earlier_ends.size:
        nearest_gap = (window_start - earlier_ends).min()
        # only on-source windows shorter than 10 sec get a longer off-source window, and the length is
        # capped at 10 times the duration, to keep the computational cost down
        if nearest_gap >= duration and duration < 10:
            return window_end - min(nearest_gap, 10 * duration)
    return window_start


def _stretched_following_end(reserved_start, window_start, window_end, duration):
    '''
    description: lengthen a following background to the gap to the nearest later reserved segment
    :param reserved_start: numpy array of the start times of the reserved segments
    :param window_start: start time of the following background
    :param window_end: end time of the following background
    :param duration: duration of the target
    :return: the new end time, or window_end when the window is not changed
    '''
    later_starts = reserved_start[reserved_start > window_end]
    if later_starts.size:
        nearest_gap = (later_starts - window_end).min()
        # only on-source windows shorter than 10 sec get a longer off-source window, and the length is
        # capped at 10 times the duration, to keep the computational cost down
        if nearest_gap >= duration and duration < 10:
            return window_start + min(nearest_gap, 10 * duration)
    return window_end
def TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        tile an epoch with 1 sec target segments that slide by 1 sec, each paired with the 10 sec
        preceding background that ends where the target starts; no following background is used.
        Only Epoch_lt is read, the other arguments are accepted so that the signature is parallel to
        the other segment finders.
    USAGE: Listsegments = TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr)

    :param df: not used
    :param Epoch_lt: a list of epochs, the start of the first and the end of the last epoch are used
    :param TargetGlitchClass: not used
    :param IFO: not used
    :param BGSNR_thre: not used
    :param targetSNR_thre: not used
    :param Confidence_thre: not used
    :param UserDefinedDuration: not used
    :param gap: not used
    :param position_duration_bfr_centr: not used
    :param TriggerPeakFreqLowerCutoff: not used
    :param TriggerPeakFreqUpperCutoff: not used
    :param targetUpperSNR_thre: not used
    :param flag: not used
    :return: Listsegments, a list with int(end - start) tuples
             (index, target start, target end, preceding background start, preceding background end,
              0, 0, GPS time at the middle of the target, 1, 10, 0, 'None')
             where the trailing constants stand for the following background (absent), the duration,
             the SNR, the confidence and the ID
    '''
    on_source_length = 1  # length of a target segment in sec, also the sliding step
    off_source_length = 10  # length of a preceding background in sec

    epoch_begin, epoch_finish = Epoch_lt[0][0], Epoch_lt[-1][-1]
    number_windows = int(epoch_finish - epoch_begin)
    # the first background starts at the beginning of the epoch, the first target right after it
    first_target_start = epoch_begin + off_source_length

    Listsegments = []
    for window in range(number_windows):
        shift = on_source_length * window
        target_start = first_target_start + shift
        background_start = epoch_begin + shift
        Listsegments.append((
            window,
            target_start,
            target_start + on_source_length,
            background_start,
            background_start + off_source_length,
            0,  # no following background
            0,
            target_start + 0.5 * on_source_length,
            1,
            10,
            0,
            'None',
        ))
    return Listsegments
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #take time series of safe channels and calculate the absolute values of whitened FFT #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
def select_some_trials(iterate, maximum_iterate=5):
    '''
    description:
        Limit the number of off-source trials. When more than maximum_iterate trials are
        available, maximum_iterate distinct trial indices are drawn at random (without
        replacement) from 0 ... iterate - 1; otherwise every index is kept, in order.
    USAGE: list_index_trials = select_some_trials(iterate, maximum_iterate=5)

    :param iterate: a number of available trials
    :param maximum_iterate: a maximum number of trials to keep
    :return: a numpy array of trial indices whose length is maximum_iterate or less
    '''
    must_subsample = iterate > maximum_iterate
    if not must_subsample:
        # few enough trials: keep all of them
        return np.arange(iterate).astype(int)
    candidate_indices = range(iterate)
    return np.random.choice(candidate_indices, size=maximum_iterate, replace=False)
def _whitening_fft_length(segment_start, segment_end):
    '''Return the segment length truncated to the millisecond and lowered to an even number of milliseconds.'''
    duration = segment_end - segment_start
    # remove everything below the third decimal place (fourth decimal digit and beyond)
    fourth_decimal_num = (duration * 1000 - floor(duration * 1000)) * 10
    duration = duration - fourth_decimal_num * 0.0001
    # a ValueError occurs in the whitening when the lengths of the whitened data, the
    # whitening data and the filter differ, hence the third decimal digit is forced to be even.
    # round(duration, 2) is a multiple of 10 ms, so the parity of this difference is the
    # parity of the millisecond digit (Python's % keeps the test valid for negative values)
    third_decimal_num = round((round(duration, 3) - round(duration, 2)) * 1000)
    if third_decimal_num % 2 == 1:
        duration = duration - 0.001
    # the subtractions above leave floating point residue below the third decimal place
    return round(duration, 3)


def _whitened_background_trials(background, reference_asd, fft_length):
    '''Return the concatenated whitened ASDs of randomly chosen sub-segments of one background window.'''
    if not background.duration.value > fft_length:
        # the background is not longer than the target segment: a single trial
        return background.asd(fft_length, method='welch').value / reference_asd

    # number of non-overlapping trials that fit in the background
    n_trials = int(background.duration.value / fft_length)
    # use the span of the cropped data itself to avoid cropping warnings
    span_start, span_end = background.span[0], background.span[1]

    # trials counted from the starting time; an end time beyond the window is clamped
    windows = []
    for i in select_some_trials(n_trials):
        crop_start = span_start + i * fft_length
        crop_end = crop_start + fft_length
        windows.append((crop_start, crop_end if crop_end < span_end else span_end))

    # trials counted from the ending time; a start time before the window is clamped
    # to the start of this same window
    for i in select_some_trials(n_trials):
        crop_end = span_end - i * fft_length
        crop_start = crop_end - fft_length
        windows.append((crop_start if crop_start > span_start else span_start, crop_end))

    whitened_trials = [background.crop(start=crop_start, end=crop_end).asd(fft_length, method='welch').value / reference_asd
                       for crop_start, crop_end in windows]
    return np.concatenate(whitened_trials, axis=None)


def Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments.
        1. derive the FFT length from the target duration (even number of milliseconds)
        2. estimate the background ASD from the available background(s); when both are
           available, it is the average weighted by the requested background durations
        3. divide the ASDs of randomly chosen background sub-segments by the background ASD
        4. divide the ASD of the target segment by the background ASD
        A background whose start time is 0 is treated as unavailable.
    USAGE: whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)

    :param full_timeseries: time series comprising target and BGs (expected to be a gwpy TimeSeries; it must provide sample_rate, span, crop() and asd())
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG, 0 if there is no preceding BG
    :param pre_background_end: an end time of a preceding BG
    :param fol_background_start: a start time of a following BG, 0 if there is no following BG
    :param fol_background_end: an end time of a following BG
    :return:
        whitened_fft_target: whitened fft of a target segment
        whitened_fft_PBG: whitened fft of the preceding BG trials (1-D array), None if there is no preceding BG
        whitened_fft_FBG: whitened fft of the following BG trials (1-D array), None if there is no following BG
        sample_rate: sampling rate of this channel
        DURATION: the duration of a target segment used as the FFT length
    :raises ValueError: if neither background is available, or propagated from the ASD calculation
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = _whitening_fft_length(target_timeseries_start, target_timeseries_end)

    has_pre_background = pre_background_start != 0
    has_fol_background = fol_background_start != 0
    if not (has_pre_background or has_fol_background):
        raise ValueError('at least one of the preceding and following backgrounds is required for whitening')

    full_timeseries_start = full_timeseries.span[0]
    full_timeseries_end = full_timeseries.span[1]

    if has_pre_background and has_fol_background:
        # weights of the two backgrounds, proportional to their own requested durations
        pre_background_duration = pre_background_end - pre_background_start
        fol_background_duration = fol_background_end - fol_background_start
        total_background_duration = pre_background_duration + fol_background_duration
        pre_background_asd_weight = pre_background_duration / total_background_duration
        fol_background_asd_weight = fol_background_duration / total_background_duration

    pre_background = None
    if has_pre_background:
        # clamp to the available data so that crop() does not warn
        if pre_background_start < full_timeseries_start:
            pre_background_start = full_timeseries_start
        pre_background = full_timeseries.crop(start=pre_background_start, end=pre_background_end)

    fol_background = None
    if has_fol_background:
        # clamp to the available data so that crop() does not warn
        if fol_background_end > full_timeseries_end:
            fol_background_end = full_timeseries_end
        fol_background = full_timeseries.crop(start=fol_background_start, end=fol_background_end)

    # background ASD used for whitening; a ValueError from asd() propagates with its message
    if has_pre_background and has_fol_background:
        pre_background_asd = pre_background.asd(DURATION, method='welch')
        fol_background_asd = fol_background.asd(DURATION, method='welch')
        ave_background_asd = pre_background_asd_weight * pre_background_asd + fol_background_asd_weight * fol_background_asd
    elif has_pre_background:
        # NOTE(review): the single-background cases rely on the default method of asd()
        # whereas every other call passes method='welch' - confirm this is intended
        ave_background_asd = pre_background.asd(DURATION)
    else:
        ave_background_asd = fol_background.asd(DURATION)
    ave_background_asd = ave_background_asd.value  # convert it to numpy array

    # whitening the BG trials; the preceding BG is processed first to keep the order of random draws
    whitened_fft_PBG = None  # it is not calculated without a preceding BG
    if has_pre_background:
        whitened_fft_PBG = _whitened_background_trials(pre_background, ave_background_asd, DURATION)
    whitened_fft_FBG = None  # it is not calculated without a following BG
    if has_fol_background:
        whitened_fft_FBG = _whitened_background_trials(fol_background, ave_background_asd, DURATION)

    # whitening the target using the background ASD
    target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    fft_target = target_timeseries.asd(DURATION, method='welch').value
    whitened_fft_target = fft_target / ave_background_asd
    return whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION
def Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments and puts out the absolute
        values of the whitened timeseries of the on- and off-source windows instead of the
        frequency series. The deviation of the whitened timeseries is informative, so
        absolute values are calculated.
        This function was used to compare the metric evaluated in the time domain with the
        metric evaluated in the frequency domain. The author concluded that the metric in the
        frequency domain is better, so this function is kept for reference only.
        A background whose start time is 0 is treated as unavailable.
    USAGE: whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION = Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)

    :param full_timeseries: time series comprising target and BGs (expected to be a gwpy TimeSeries; it must provide sample_rate, crop(), asd() and whiten())
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG, 0 if there is no preceding BG
    :param pre_background_end: an end time of a preceding BG
    :param fol_background_start: a start time of a following BG, 0 if there is no following BG
    :param fol_background_end: an end time of a following BG
    :return:
        whitened_target_timeseries_abs: the absolute values of whitened timeseries in the on-source window
        whitened_pre_off_source_abs: the absolute values of whitened timeseries in the preceding off-source window, None if there is no preceding BG
        whitened_fol_off_source_abs: the absolute values of whitened timeseries in the following off-source window, None if there is no following BG
        sample_rate: sampling rate of this channel
        DURATION: the duration of a target segment used as the FFT length
    :raises ValueError: if neither background is available, or propagated from the ASD calculation
    '''
    sample_rate = full_timeseries.sample_rate.value

    DURATION = target_timeseries_end - target_timeseries_start
    # remove everything below the third decimal place (fourth decimal digit and beyond)
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001
    # a ValueError occurs in the whitening when the lengths of the whitened data, the
    # whitening data and the filter differ, hence the third decimal digit is forced to be even
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:
        DURATION = DURATION - 0.001
    # the subtractions above leave floating point residue below the third decimal place
    DURATION = round(DURATION, 3)

    has_pre_background = pre_background_start != 0
    has_fol_background = fol_background_start != 0
    if not (has_pre_background or has_fol_background):
        raise ValueError('at least one of the preceding and following backgrounds is required for whitening')

    # requested durations of the backgrounds, each measured from its own start time
    pre_background_duration = pre_background_end - pre_background_start
    fol_background_duration = fol_background_end - fol_background_start

    # ASD of everything up to the end of the preceding BG and of everything from the start
    # of the following BG; a ValueError from asd() propagates with its original message
    if has_pre_background:
        pre_background_asd = full_timeseries.crop(end=pre_background_end).asd(DURATION, method='welch')
    if has_fol_background:
        fol_background_asd = full_timeseries.crop(start=fol_background_start).asd(DURATION, method='welch')

    if has_pre_background and has_fol_background:
        # average of the two BG ASDs weighted by the requested durations
        total_background_duration = pre_background_duration + fol_background_duration
        pre_background_asd_weight = pre_background_duration / total_background_duration
        fol_background_asd_weight = fol_background_duration / total_background_duration
        ave_background_asd = pre_background_asd_weight * pre_background_asd + fol_background_asd_weight * fol_background_asd
    elif has_pre_background:
        # only the available background can describe the noise
        ave_background_asd = pre_background_asd
    else:
        ave_background_asd = fol_background_asd

    whitened_full_timeseries = full_timeseries.whiten(fftlength=DURATION, asd=ave_background_asd)

    # off-source windows; 5 % of each window is dropped at the side that is away from the target
    whitened_pre_off_source_abs = None  # stays None without a preceding BG
    if has_pre_background:
        whitened_pre_off_source = whitened_full_timeseries.crop(start=pre_background_start + 0.05 * pre_background_duration, end=pre_background_end)
        whitened_pre_off_source_abs = whitened_pre_off_source.abs().value

    whitened_fol_off_source_abs = None  # stays None without a following BG
    if has_fol_background:
        whitened_fol_off_source = whitened_full_timeseries.crop(start=fol_background_start, end=fol_background_end - 0.05 * fol_background_duration)
        whitened_fol_off_source_abs = whitened_fol_off_source.abs().value

    # on-source window
    whitened_target_timeseries = whitened_full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    whitened_target_timeseries_abs = whitened_target_timeseries.abs().value
    return whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION
# # def Multiprocess_subtracting(target_timeseries, pre_background, fol_background): # ''' # description: # This is used for multi processing for subtracting segments # :param ch_label: channel label # :param target_timeseries: time series of a target segment # :param pre_background: time series of a preceding BG # :param fol_background: time sereis of a following BG # :param pre_background_start: start time of a preceding BG # :param pre_background_end: end time of a preceding BG # :param fol_background_start: start time of a following BG # :return: whitened_fft_target: whitened fft of a target segment # whitened_fft_PBG: whitened fft of a preceding segment # whitened_fft_FBG: whitened fft of a following segment # sample_rate: sampling rate of this channel # DURATION: a duration of a target segment # ''' # sample_rate = target_timeseries.sample_rate.value # DURATION = target_timeseries.duration.value # fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number # DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number # # revise the duration to make an error in whitening process not not to occur # # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different # third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000) # if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point # DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point # elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point # pass # keep the original duration as it is # DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point # pre_background_start = pre_background.span[0] # start time of a preceding BG # pre_background_end = pre_background.span[1] # end 
time of a preceding BG # fol_background_start = fol_background.span[0] # start time of a following BG # fol_background_end = fol_background.span[1] # end time of a following BG # if pre_background_start != 0 and fol_background_start != 0: # if both BGs are available # # average background ASD # try: # pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs()# averaged fft of the preceding BG # fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs() # averaged fft of the following BG # except ValueError: # raise ValueError # ave_background_fft = (pre_background_fft + fol_background_fft) / 2. # average of those two BG ASD # # for the preceding BG # if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment # res_fft_PBG = np.array([]) # residual # # number of BG trials # iterate = int(pre_background.duration.value / DURATION) # for i in range(iterate): # iterate though the number of trials # # iterate from the starting time of the preceding BG # crop_start = pre_background_start + i * DURATION # start time of the cropping # crop_end = crop_start + DURATION # ending time of of the cropping # fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it # res_fft_PBG_ind = fft_PBG - ave_background_fft # calcurate a residual of the cropped BG from the averaged BG # res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array # # iterate from the ending time of the preceding BG # crop_end = pre_background_end - i * DURATION # ending time of of the cropping # crop_start = crop_end - DURATION # start time of the cropping # fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it # res_fft_PBG_ind = fft_PBG - ave_background_fft # calcurate a residual of the cropped BG from the averaged BG # res_fft_PBG 
= np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array # else: # if the preceding BG duration is the same as that of the target segment # fft_PBG = pre_background.crop(start=pre_background_end - DURATION).average_fft(DURATION, window='hanning').abs() # # whitening the preceding BG using the average BG # res_fft_PBG = fft_PBG - ave_background_fft # # for the following BG # if fol_background.duration.value > DURATION: # if the following BG duration is longer than that of the target segment # res_fft_FBG = np.array([]) # # number of BG trials # iterate = int(fol_background.duration.value / DURATION) # for i in range(iterate): # iterate though the number of trials # # iterate from the starting time of the following BG # crop_start = fol_background_start + i * DURATION # start time of the cropping # crop_end = crop_start + DURATION # ending time of of the cropping # fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it # res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG # res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array # # iterate from the ending time of the following BG # crop_end = fol_background_end - i * DURATION # ending time of of the cropping # crop_start = crop_end - DURATION # start time of the cropping # fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it # res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG # res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array # else: # if the following BG duration is the same as that of the target segment # fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs() # # 
whitening the following BG using the average BG # res_fft_FBG = fft_FBG - ave_background_fft # elif pre_background_start != 0 and fol_background_start == 0: # if there is an available proceeding background # # fft of a preceding BG # try: # pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs() # fft of the preceding BG # except ValueError: # raise ValueError # ave_background_fft = pre_background_fft # average of those two BG ASD # # for the preceding BG # if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment # res_fft_PBG = np.array([]) # # number of BG trials # iterate = int(pre_background.duration.value / DURATION) # for i in range(iterate): # iterate though the number of trials # # iterate from the starting time of the preceding BG # crop_start = pre_background_start + i * DURATION # start time of the cropping # crop_end = crop_start + DURATION # ending time of of the cropping # fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it # res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG # res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array # # iterate from the ending time of the preceding BG # crop_end = pre_background_end - i * DURATION # ending time of of the cropping # crop_start = crop_end - DURATION # start time of the cropping # fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it # res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG # res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array # else: # if the preceding BG duration is the same as that of the target segment # fft_PBG = pre_background.crop(start=pre_background_end - 
DURATION).average_fft(DURATION, window='hanning').abs() # # whitening the preceding BG using the average BG # res_fft_PBG = fft_PBG - ave_background_fft # whitened_fft_FBG = None # it is not calculated # elif pre_background_start == 0 and fol_background_start != 0: # if there is an available proceeding background # try: # fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs() # fft of the following BG # except ValueError: # raise ValueError # ave_background_fft = fol_background_fft # average of those two BG ASD # whitened_fft_PBG = None # it is not calculated # if fol_background.duration.value > DURATION: # if the following BG duration is longer than that of the target segment # res_fft_FBG = np.array([]) # # number of BG trials # iterate = int(fol_background.duration.value / DURATION) # for i in range(iterate): # iterate though the number of trials # # iterate from the starting time of the following BG # crop_start = fol_background_start + i * DURATION # start time of the cropping # crop_end = crop_start + DURATION # ending time of of the cropping # fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it # res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG # res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array # # iterate from the ending time of the following BG # crop_end = fol_background_end - i * DURATION # ending time of of the cropping # crop_start = crop_end - DURATION # start time of the cropping # fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it # res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG # whitened_fft_FBG = np.append(whitened_fft_FBG, res_fft_FBG_ind) # append to the whole trials array # else: # if the following BG duration is the same as 
that of the target segment # fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs() # # whitening the following BG using the average BG # res_fft_FBG = fft_FBG - ave_background_fft # # whitening Target # fft_target = target_timeseries.average_fft(DURATION, window='hanning').abs() # fft of the target # # whitening the target using the average BG # res_fft_target = fft_target - ave_background_fft # return res_fft_target, res_fft_PBG, res_fft_FBG, sample_rate, DURATION
def SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1. take the information of the list of allowed target and a preceding and following segments
        2. whiten a target segment based on the average background segment
        3. find whitened FFT
        4. save the whitened target and backgrounds FFTs
        Note this depends on Multiprocess_whitening()

        Layout of the output file: one group 'gps{index:05d}' per segment with the attributes
        'GPS', 'duration', 'snr', 'confidence', 'id' and 'ifo'. Inside a group, channel number i has
        the data sets 'chT{i:05d}' (target), 'chP{i:05d}' (preceding BG, only if available) and
        'chF{i:05d}' (following BG, only if available), each with the attributes
        'SamplingRate', 'channel' and 'duration'.

        The process exits (sys.exit) if the output directory does not exist or if the output file already exists.
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails.
    USAGE: SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: a list of segment parameters, each element is
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID);
        a start and an end equal to 0 mean that the corresponding background is not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a name of an interferometer used as the channel prefix and in the frame types
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return: None
    '''
    # check if the output directory exists
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    raw_frametype = '{}_R'.format(IFO)
    hoft_frametype = '{}_HOFT_C00'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, 'w-' fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for (IndexSatisfied, target_timeseries_start, target_timeseries_end,
             pre_background_start, pre_background_end,
             fol_background_start, fol_background_end,
             gpstime, duration, SNR, confi, ID) in Listsegments:
            t0 = time.time()
            # find which backgrounds are available
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            if has_pre and has_fol:  # both preceding and following backgrounds
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # no data would be queried for this segment, so the data of a previous segment
                # (or nothing at all) would be whitened: skip it instead
                print('No usable background segment, go to a next GPS')
                continue
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype=raw_frametype, verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype=hoft_frametype, verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                    channels_used = channels + [hoft_channel]
            except IndexError:
                print('Bad data, go to a next GPS')
                continue  # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError:  # e.g. ValueError: Failed to read a frame file
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    whitened = list(executor.map(Multiprocess_whitening,
                                                 full_timeseriesDict.values(),
                                                 itrtls.repeat(target_timeseries_start),
                                                 itrtls.repeat(target_timeseries_end),
                                                 itrtls.repeat(pre_background_start),
                                                 itrtls.repeat(pre_background_end),
                                                 itrtls.repeat(fol_background_start),
                                                 itrtls.repeat(fol_background_end)))
                # the results are assumed to be in the same order as channels_used
                for ch_label, channel in enumerate(channels_used):  # iterate through channel labels
                    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = whitened[ch_label]
                    channel_name = channel.split(':')[1]
                    dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=whitened_fft_target, compression="gzip", compression_opts=9)  # target
                    dsetTarget.attrs['SamplingRate'] = sample_rate
                    dsetTarget.attrs['channel'] = channel_name
                    dsetTarget.attrs['duration'] = DURATION
                    # a background is None when it is not available ("Either" BG selection);
                    # "is not None" is required because "!= None" is evaluated element by element on an array
                    for prefix, whitened_fft_BG in (("chP", whitened_fft_PBG), ("chF", whitened_fft_FBG)):
                        if whitened_fft_BG is None:
                            continue
                        dsetBG = g.create_dataset("{}{:05d}".format(prefix, ch_label), data=whitened_fft_BG, compression="gzip", compression_opts=9)  # BG
                        dsetBG.attrs['SamplingRate'] = sample_rate
                        dsetBG.attrs['channel'] = channel_name
                        dsetBG.attrs['duration'] = DURATION
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            except IOError:  # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
def SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT):
    '''
    description:
        THIS IS USED FOR "ONLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1. take the information of the list of allowed target and a preceding and following segments
        2. whiten a target segment based on the average background segment
        3. find whitened FFT
        4. save the whitened target and backgrounds FFTs
        5. pass the group object of every saved segment to the caller
        Note this depends on Multiprocess_whitening()

        This is a generator: nothing is done until it is iterated, and the HDF5 file stays open
        while the generator is being consumed.
        Layout of the output file: one group 'gps{index:05d}' per segment with the attributes
        'GPS', 'duration', 'snr', 'confidence', 'id' and 'ifo'. Inside a group, channel number i has
        the data sets 'chT{i:05d}' (target), 'chP{i:05d}' (preceding BG, only if available) and
        'chF{i:05d}' (following BG, only if available), each with the attributes
        'SamplingRate', 'channel' and 'duration'.

        The process exits (sys.exit) if the output file already exists.
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails.
    USAGE: for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT): ...

    :param Listsegments: a list of segment parameters, each element is
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID);
        a start and an end equal to 0 mean that the corresponding background is not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a name of an interferometer used as the channel prefix and in the frame types
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :return: yield the HDF5 group object of each segment that is saved
    '''
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    raw_frametype = '{}_R'.format(IFO)
    hoft_frametype = '{}_HOFT_C00'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, 'w-' fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    with h5py.File(outputPathFileName, 'r+') as f:
        for (IndexSatisfied, target_timeseries_start, target_timeseries_end,
             pre_background_start, pre_background_end,
             fol_background_start, fol_background_end,
             gpstime, duration, SNR, confi, ID) in Listsegments:
            t0 = time.time()
            # find which backgrounds are available
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            if has_pre and has_fol:  # both preceding and following backgrounds
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # no data would be queried for this segment, so the data of a previous segment
                # (or nothing at all) would be whitened: skip it instead
                print('No usable background segment, go to a next GPS')
                continue
            # extracting the target data and the preceding and following backgrounds
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype=raw_frametype, verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype=hoft_frametype, verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                    channels_used = channels + [hoft_channel]
            except IndexError:  # e.g. IndexError: After reading object of type: FrAdcData stream is no longer in good state
                continue  # go to a next GPS time
            except RuntimeError:
                continue  # go to a next GPS time
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError:  # e.g. ValueError: Failed to read a frame file
                print('ValueError, go to a next GPS')
                continue
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    whitened = list(executor.map(Multiprocess_whitening,
                                                 full_timeseriesDict.values(),
                                                 itrtls.repeat(target_timeseries_start),
                                                 itrtls.repeat(target_timeseries_end),
                                                 itrtls.repeat(pre_background_start),
                                                 itrtls.repeat(pre_background_end),
                                                 itrtls.repeat(fol_background_start),
                                                 itrtls.repeat(fol_background_end)))
                # the results are assumed to be in the same order as channels_used
                for ch_label, channel in enumerate(channels_used):  # iterate through channel labels
                    # the order (target, preceding BG, following BG, ...) is the one used by the OFFLINE and
                    # TimeShift functions; the preceding and following BGs used to be swapped here
                    # NOTE(review): confirm against the return statement of Multiprocess_whitening()
                    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = whitened[ch_label]
                    channel_name = channel.split(':')[1]
                    dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=whitened_fft_target, compression="gzip", compression_opts=9)  # target
                    dsetTarget.attrs['SamplingRate'] = sample_rate
                    dsetTarget.attrs['channel'] = channel_name
                    dsetTarget.attrs['duration'] = DURATION
                    # a background is None when it is not available ("Either" BG selection);
                    # "is not None" is required because "!= None" is evaluated element by element on an array
                    for prefix, whitened_fft_BG in (("chP", whitened_fft_PBG), ("chF", whitened_fft_FBG)):
                        if whitened_fft_BG is None:
                            continue
                        dsetBG = g.create_dataset("{}{:05d}".format(prefix, ch_label), data=whitened_fft_BG, compression="gzip", compression_opts=9)  # BG
                        dsetBG.attrs['SamplingRate'] = sample_rate
                        dsetBG.attrs['channel'] = channel_name
                        dsetBG.attrs['duration'] = DURATION
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue
            except IOError:  # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            yield g  # pass a group object as a generator
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
def SaveTargetAndBackGroundHDF5_TimeShift(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE WITH TIME-SHIFTED SEGMENTS
        0. assuming Listsegments is given by Findglitchlist()
        1. for the first segment, query the data that spans the target and the available backgrounds;
           for the later segments, query only the target window and prepend the data kept from the previous segment
        2. whiten a target segment based on the average background segment
        3. find whitened FFT
        4. save the whitened target and backgrounds FFTs
        Note this depends on Multiprocess_whitening()

        Layout of the output file: one group 'gps{index:05d}' per segment with the attributes
        'GPS', 'duration', 'snr', 'confidence', 'id' and 'ifo'. Inside a group, channel number i has
        the data sets 'chT{i:05d}' (target), 'chP{i:05d}' (preceding BG, only if available) and
        'chF{i:05d}' (following BG, only if available), each with the attributes
        'SamplingRate', 'channel' and 'duration'.

        The process exits (sys.exit) if the output file already exists.
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails.
    USAGE: SaveTargetAndBackGroundHDF5_TimeShift(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: a list of segment parameters, each element is
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID);
        a start and an end equal to 0 mean that the corresponding background is not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a name of an interferometer used as the channel prefix and in the frame types
    :param outputpath: a directory of an output file
    :param outputfilename: a name of an output file
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return: None
    '''
    # the list of the safe channels, this list is only used for querying the raw frames
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    raw_frametype = '{}_R'.format(IFO)
    hoft_frametype = '{}_HOFT_C00'.format(IFO)
    # the channels that are saved; it is built once because appending the hoft channel to "channels"
    # for every segment made the list grow and put the hoft channel in the raw frame queries
    channels_used = (channels + [hoft_channel]) if PlusHOFT == 'True' else channels
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, 'w-' fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # the data kept from the previous segment, None until a query succeeds
    previous_full_timeseriesDict = None
    previous_pre_background_start = None
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for (IndexSatisfied, target_timeseries_start, target_timeseries_end,
             pre_background_start, pre_background_end,
             fol_background_start, fol_background_end,
             gpstime, duration, SNR, confi, ID) in Listsegments:
            t0 = time.time()
            # find which backgrounds are available
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            if has_pre and has_fol:  # both preceding and following backgrounds
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # no data would be queried for this segment, so the data of a previous segment
                # (or nothing at all) would be whitened: skip it instead
                print('No usable background segment, go to a next GPS')
                continue
            # the whole window is queried for the first segment, and also when no previous data
            # is kept (e.g. the query of the first segment failed)
            first_query = IndexSatisfied == 0 or previous_full_timeseriesDict is None
            if not first_query:
                # only the target window is new, the rest comes from the previous segment
                query_start, query_end = target_timeseries_start, target_timeseries_end
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype=raw_frametype, verbose=False, nproc=number_process)
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype=hoft_frametype, verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                if not first_query:
                    # NOTE(review): cropping the previous data at its own starting time looks like a no-op,
                    # confirm whether the current pre_background_start was intended
                    full_timeseriesDict.prepend(previous_full_timeseriesDict.crop(start=previous_pre_background_start))
                previous_full_timeseriesDict = full_timeseriesDict
                previous_pre_background_start = pre_background_start
            # =====================================================
            except IndexError:
                print('Bad data, go to a next GPS')
                continue  # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue  # this was missing, so the stale data was processed
            except ValueError:  # e.g. ValueError: Failed to read a frame file
                print('ValueError, go to a next GPS')
                continue  # this was missing, so the stale data was processed
            t2 = time.time()
            print('Time taken for query: {} s'.format(t2 - t0))
            g = f.create_group('gps{:05d}'.format(IndexSatisfied))
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    whitened = list(executor.map(Multiprocess_whitening,
                                                 full_timeseriesDict.values(),
                                                 itrtls.repeat(target_timeseries_start),
                                                 itrtls.repeat(target_timeseries_end),
                                                 itrtls.repeat(pre_background_start),
                                                 itrtls.repeat(pre_background_end),
                                                 itrtls.repeat(fol_background_start),
                                                 itrtls.repeat(fol_background_end)))
                # the results are assumed to be in the same order as channels_used
                for ch_label, channel in enumerate(channels_used):  # iterate through channel labels
                    whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = whitened[ch_label]
                    channel_name = channel.split(':')[1]
                    dsetTarget = g.create_dataset("chT{:05d}".format(ch_label), data=whitened_fft_target, compression="gzip", compression_opts=9)  # target
                    dsetTarget.attrs['SamplingRate'] = sample_rate
                    dsetTarget.attrs['channel'] = channel_name
                    dsetTarget.attrs['duration'] = DURATION
                    # a background is None when it is not available ("Either" BG selection);
                    # "is not None" is required because "!= None" is evaluated element by element on an array
                    for prefix, whitened_fft_BG in (("chP", whitened_fft_PBG), ("chF", whitened_fft_FBG)):
                        if whitened_fft_BG is None:
                            continue
                        dsetBG = g.create_dataset("{}{:05d}".format(prefix, ch_label), data=whitened_fft_BG, compression="gzip", compression_opts=9)  # BG
                        dsetBG.attrs['SamplingRate'] = sample_rate
                        dsetBG.attrs['channel'] = channel_name
                        dsetBG.attrs['duration'] = DURATION
            except ValueError:
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            except IOError:  # IOError: Driver write request failed
                del f['gps{:05d}'.format(IndexSatisfied)]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
#+++++++++++++++++++++++++++++++++++++++++++++++ # plot tools #+++++++++++++++++++++++++++++++++++++++++++++++
class PlotTableAnalysis():
    '''
    description:
        tools working on a HDF5 file whose top-level groups are the samples
        and whose data sets in a group are named 'chT...', 'chP...' and 'chF...'
    '''

    def __init__(self):
        self.f = None  # a HDF5 file storing a glitch class

    def getHDF5Object(self):
        '''
        show a HDF5 file object
        USAGE: getHDF5Object()
        :return: the HDF5 file object given by setHDF5Object(), or None (after printing a message) if it is not set yet
        '''
        # "is None" is used because "== None" depends on how the stored object defines the equality
        if self.f is None:
            print('A HDF5 file object is not set yet')
            return None
        return self.f

    def setHDF5Object(self, input_dir, input_file):
        '''
        set a HDF5 file object
        USAGE: setHDF5Object(input_dir, input_file)
        :param input_dir: a directory of an input HDF5 file
        :param input_file: a name of an input HDF5 file
        :return: None
        '''
        input_path = os.path.join(input_dir, input_file)
        # read it in the read mode; the file is left open and it is not closed by this method
        self.f = h5py.File(input_path, 'r')

    def FindNumberSample(self):
        '''
        description:
            find the number of samples, i.e. the number of top-level keys of the HDF5 file object
        USAGE: number_sample = FindNumberSample()
        :return: the number of samples
        '''
        return len(self.f.keys())

    def FindNumberChannels(self, g):
        '''
        description:
            count a number of channels that are analyzed,
            i.e. the number of data sets whose name has 'T' as the third character (e.g. 'chT00000')
        USAGE: NumberOfChannels = FindNumberChannels(g)
        :param g: a HDF file group object
        :return: NumberOfChannels: a number of channels that are analyzed
        '''
        # a slice is used so that a name shorter than three characters is simply not counted
        NumberOfChannels = sum(1 for dset in g.keys() if str(dset)[2:3] == 'T')
        return NumberOfChannels
[docs] def HierarchyChannelAboveThreshold(self, g, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'): ''' description: calculate values of importance which is the number of frequency bins above a the off-source window for channels for a given time of a glitch USAGE: RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq, sigma=10) :param g: (hdf5 format) a group having a glitch :param pt: a value of area integrating a distribution upto :param sigma: value of standard deviation of ratio of medians to determine important channels :return: RankingChannelAndRatioMed: # a list of channel names with their ratios descended by ratios Importantchannels: # channels with their ratio is greater than the threshold of ratio, along with their ratios ''' SNR = g.attrs['snr'] GPS = g.attrs['GPS'] confidence = g.attrs['confidence'] ID = g.attrs['id'] DURATION = g.attrs['duration'] ListChannelName = np.array([]) ListCount = np.array([]) NumberOfChannels = self.FindNumberChannels(g) for i in range(NumberOfChannels): dsetT = g['chT{:05d}'.format(i)] # a target data set try: # this is required in the condition for "Either" BG selection dsetP = g['chP{:05d}'.format(i)] # a preceding BG data set except KeyError: dsetP = None try: # this is required in the condition for "Either" BG selection dsetF = g['chF{:05d}'.format(i)] # a following BG data set except KeyError: dsetF = None ChannelName = dsetT.attrs['channel'].decode('utf-8') # channel name duration = dsetT.attrs['duration'] # duration sampling_rate = dsetT.attrs['SamplingRate'] # sampling rate of this channel spT = dsetT[()] # target data set pixel value if dsetP != None: # this is required in the condition for "Either" BG selection spP = dsetP[()] # a preceding BG data set pixel value else: spP = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and 
np.isnan(spP[:,0]).any() != True: if dsetF != None: # this is required in the condition for "Either" BG selection spF = dsetF[()] # a following BG data set pixel value else: spF = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: #======================= if dsetP != None: Number_PBG_trials = spP.shape[0] / spT.shape[0] spP = spP.reshape(int(Number_PBG_trials), spT.shape[0]) if dsetF != None: Number_FBG_trials = spF.shape[0] / spT.shape[0] spF = spF.reshape(int(Number_FBG_trials), spT.shape[0]) #======================= #++++++ for the case where there is no data measured at a channel +++++++++++++++++++ #++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++ #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: # if target and BG have data measured pT = spT #** 2 # power of target if dsetP != None: pP = spP #** 2 # power of a preceding BG if dsetF != None: pF = spF #** 2 # power of a following BG # low, high, or band pass filter if we know which frequency band is important if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. 
However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: pT = np.array([0]) pP = np.array([0]) pF = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq*duration) if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector pT = pT[LowerCutOffElement:] if dsetP != None: pP = pP[:, LowerCutOffElement:] if dsetF != None: pF = pF[:, LowerCutOffElement:] else: # if LowerCutOffElement is greater than the length of the vector pT = np.array([0]) if dsetP != None: pP = np.array([0]) if dsetF != None: pF = np.array([0]) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter # even if UpperCutOffElement is greater than the length of the vector is works fine UpperCutOffElement = int(UpperCutOffFreq * duration) pT = pT[:UpperCutOffElement] if dsetP != None: pP = pP[:, :UpperCutOffElement] if dsetF != None: pF = pF[:, :UpperCutOffElement] elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter ## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. 
However pT would be non-zero due to the bad resolution (a very short giltch) if sampling_rate / 2 <= LowerCutOffFreq: pT = np.array([0]) pP = np.array([0]) pF = np.array([0]) else: LowerCutOffElement = int(LowerCutOffFreq * duration) UpperCutOffElement = int(UpperCutOffFreq * duration) if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector pT = pT[LowerCutOffElement:UpperCutOffElement] if dsetP != None: pP = pP[:, LowerCutOffElement:UpperCutOffElement] if dsetF != None: pF = pF[:, LowerCutOffElement:UpperCutOffElement] else: # if LowCutOffElement is greater than the length of the vector pT = np.array([0]) if dsetP != None: pP = np.array([0]) if dsetF != None: pF = np.array([0]) else: pass if dsetP != None and dsetF != None: pB = np.concatenate((pP, pF), axis=0) elif dsetP != None and dsetF == None: pB = pP elif dsetP == None and dsetF != None: pB = pF GausCeiling = pB.mean() + sigma * pB.std(ddof=1) # gaussian noise ceiling ########a metric with amplitude ratio ######## #try: # GausCeiling = pB.sum(axis=1).mean() + pB.sum(axis=1).std(ddof=1) #except ValueError: # if pB is [0 0] # GausCeiling = 0 ################################################## #pt = 0.95 #histB, edgB = np.histogram(pB, bins=100, density=True) # histgram of the background #deltaB = edgB[1] - edgB[0] # the length of the bin #cumnumB = np.cumsum(histB * deltaB) # cumulative number #UpperBoundB = edgB[np.where(cumnumB > pt)[0][0]:np.where(cumnumB > pt)[0][0] + 2].mean() # median value # Note that even if pT, pB do not have elements in them, UpperBoundB becomes = 0.0050 so that there is no element in pT above the value as # I set pT = np.array([0]) in this case. 
Eventually, Count = 0, which is ok #Count = pT[pT > UpperBoundB].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling ############################ try: Count = pT[pT > GausCeiling].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling ########### a metric with amplitude ratio ############### #Count = pT.sum() / GausCeiling #if Count < 0: # Count = 0 except ZeroDivisionError: # in the situation where the number of the total elments is zero, I saw GausCeiling is nan. So this channel has mulfunction. I manually set Count = 0 Count = 0 # if the data is flat over the stretch of the segment, the upper bound threshold of BG called "GausCeiling" becomes zero (BG has to flactuate so that "GausCeiling" equal to zero is fake) # if so that importance called "Count" becomes 1. To prevent this fake importance, I set Count = 0 manually if GausCeiling == 0.: Count = 0 else: Count = 0 # manually put zero because this channel has no contribution at this glitch ID ListCount = np.append(ListCount, Count) ListChannelName = np.append(ListChannelName, ChannelName) #+++++++++ find a hierarchy of channels +++++++++++++++++++++ RankingChannelAndCount = [[ListChannelName[i], ListCount[i]] for i in np.argsort(ListCount)[::-1]] # a list of channel names with maximum value of SNR series return RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, DURATION
[docs] def CreateMatCount(self, sigma, g_Individual = None, LowerCutOffFreq='None', UpperCutOffFreq='None'): ''' description: 1. find counts for each glitch 2. stack over all the gliches 3. make a matix comprising importance versus channels USAGE: MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration = CreateMatCount(sigma, LowerCutOffFreq='None', UpperCutOffFreq='None') # for all glitches MatCount, ListChannelName, ListSNR, ListConf = CreateMatCount(sigma, g_Individual, LowerCutOffFreq='None', UpperCutOffFreq='None') # for an individual glitch :param sigma sigma: an integer to determine the upper bound of the off-source window :param g_Individual: a grounp of a HDF5 file that has values of importance for a gltich :param LowerCutOffFreq: a lower limit frequency cut to calculate a value of importance :param UpperCutOffFreq: an uppper limit frequency cut to calculate a value of importance :return: MatCount: a matrix comprising importance versus channels ListChannelName: a list of channel names ListSNR: a list of SNRs ListConf: a list of confidence dependencies: self.HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq) ''' if g_Individual == None: # for all the glitches in a glitch class MatCount = np.array([]) ListSNR = np.array([]) ListConf = np.array([]) ListGPS = np.array([]) ListDuration = np.array([]) num_dataset = 0 # a number of datasets in a group, set to be 0 as a first instance for g_label in list(self.f.keys()): # iterate through all the groups try: g = self.f[g_label] except KeyError: # KeyError: 'Unable to open object (Bad object header version number)' continue # the following if statement is for the solution where no dataset in a group if len(g.keys()) < num_dataset: # if a number of dataset in this group is less than that of the previous group continue # go to the next iteration num_dataset = len(g.keys()) # update a number of datasets with that of the current group # get an output given by HierarchyChannelAboveThreshold() 
RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g, sigma, LowerCutOffFreq, UpperCutOffFreq) # stack the vector of counts MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount # append SNR ListSNR = np.append(ListSNR, SNR) # append confidence level ListConf = np.append(ListConf, confidence) # append GPS time ListGPS = np.append(ListGPS, GPS) # append duration ListDuration = np.append(ListDuration, duration) return MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration else: # individual glitch MatCount = np.array([]) ListSNR = np.array([]) ListConf = np.array([]) # get an output given by HierarchyChannelAboveThreshold() RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g_Individual, sigma, LowerCutOffFreq, UpperCutOffFreq) # stack the vector of counts MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount # append SNR ListSNR = np.append(ListSNR, SNR) # append confidence level ListConf = np.append(ListConf, confidence) # normalize MatCount by a number of glitches MatCount = MatCount return MatCount, ListChannelName, ListSNR, ListConf
[docs] def CreateChannelTicks(self, ListChannelName): ''' description: 1. take the dominant sub-channel names 2. get a list of index where a sub-sensor name changes dependencies: (tacitly) CreateMatCount(), make_subset_channel_based_on_samplingrate() USAGE: CenterTicks, ListInd, ListSubsys = CreateChannelTicks(ListChannelName) :param ListChannelName: a list of channel names :return: CenterTicks: center value of a dominant sub sensor belongs ListInd: the edge indices of a dominant sub sensor belongs ListSubsys: a list of dominant sensor names ''' # +++ take the dominant sub-channel names +++++++++ ListSubsys = np.array([]) # a list of dominant sub sensor names for i in ListChannelName: # iterate over channel names Subsys = i.split('-')[0] # take before '-' ListSubsys = np.append(ListSubsys, Subsys) # append # ++++ get a list of index where a sub-sensor name changes +++++++++++ i = 'dummy' ListInd = np.array([]) # a list of indices where dominant channel name changes for ind, j in enumerate(ListSubsys, 0): # iterate dominant sub sensor names if i != j: # if name differencee occurs ListInd = np.append(ListInd, ind) # store an index i = j # rename current dominant sub name # append the end index ListInd = np.append(ListInd, np.array([ListSubsys.shape[0] - 1])) # find the center index of a dominant sub-channel CenterTicks = np.array([(i + j) / 2. if j - i != 1 else i for i, j in zip(ListInd[:-1], ListInd[1:])]) return CenterTicks, ListInd, ListSubsys
[docs] def PlotImportanceVSChannel(self, MatCount, ListChannelName, output_dir, output_file, ax=None): ''' description: - for all glitches in a class, this method works stand-alone 1. convert number to channel names in x ticks 2. color background based on channel types 3. plot a bar showing importance of channels USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file) - for an individual glitch in a class, this method is used by .... 1. convert number to channel names in x ticks 2. color background based on channel types 3. plot a bar showing importance of channels dependencies: CreateChannelTicks() tacitly CreateMatCount() USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax) # for on-line mode USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file) @ for off-line mode :param MatCount: a matrix comprising importance versus channels :param ListChannelName: a list of channel names :param output_dir: (only used for all glitches) :param output_file: (only used for all glitches) :param ax: matplotlib.pyplot object (used only for an individual glitch) :return: None ''' if ax == None: # plot for all glitches in a class CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName) # color the background either with black or gray for each channel Listcolor = ['black', 'gray'] * 20 * 7 for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size edgeRight = edgeRight + 1 ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) ax.set_edgecolor(None) if MatCount.shape[0] == MatCount.shape[-1]: # for an individual glitch MatCount = MatCount.reshape(1, MatCount.shape[-1]) #++++++++ plot bar plot about importance of channels ++++++++++++++++ # sum all the counts over glitches SumMatCount = np.sum(MatCount, axis=0) / float(MatCount.shape[0]) # 
normalize MatCount by a number of glitches #if MatCount.shape[0] > 1: # if there are more than 1 sample # STDMatCount = MatCount.std(axis=0, ddof=1) # standard deviation #else: # if there is only one sample # STDMatCount = np.array([None]*SumMatCount.shape[0]) # standard deviation # plot bars about importance plt.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.) #+++++++++++ plot a table of ranked channels +++++++++++++++++++ # sort channel based on the sum of the counts i.e, importance # a list of sorted Channel names & sorted Importance RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]])) #RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]], STDMatCount[np.argsort(SumMatCount)[::-1]])) # make pandas matrix Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance')) #Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance', 'Std')) # add ranking label Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1)) # round importance Impdf['Importance'] = Impdf['Importance'].astype(float).round(3) # = df.value1.round() #Impdf['Std'] = Impdf['Std'].astype(float).round(4) # re-sort columns to make rank labels coming first cols = Impdf.columns.tolist() cols = cols[-1:] + cols[:-1] Impdf = Impdf[cols] # take the first 5 high ranking channels dcsummary = Impdf.iloc[:5] # plot table on the top of the bar plot table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2]) #table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top',colWidths=[0.1, 0.6, 0.1, 0.1]) table.set_fontsize(8) #++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++ #++++++++++++++ alter ticks +++++++++++++++++ # plot xticks using the dominant sub sensor names 
plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)]) # put ticks above plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10) plt.tick_params(axis="y", labelsize=10) plt.grid(False) plt.xlim(0, ListSubsys.shape[0]) # set axis label plt.ylabel('Importance', fontsize=15) plt.xlabel('Auxiliary channel', fontsize=15) plt.tight_layout(rect=[0, 0.03, 1, 0.8]) # save a plot output_path = os.path.join(output_dir, output_file) plt.savefig(output_path) plt.close() print('Completed Saving a plot of Importance VS channel') return None else: # for an individual glitch # ++++++++++++++ alter ticks +++++++++++++++++ # get xticks for dominant channels CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName) # plot xticks using the dominant sub sensor names ax.set_xticks(CenterTicks) ax.set_xticklabels(ListSubsys[ListInd.astype(int)]) # put ticks above ax.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-35, length=0, labelrotation=90,size=0, labelsize=15) ax.tick_params(axis='y', labelsize=15) ax.set_xlim(0, ListSubsys.shape[0]) # color the background either with blakc or gray for each channel # ++++++++ plot bar plot about importance of channels ++++++++++++++++ Listcolor = ['black', 'gray'] * 20 * 7 for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size edgeRight = edgeRight + 1 ax.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) if MatCount.shape[0] == MatCount.shape[-1]: # for an individual glitch MatCount = MatCount.reshape(1, MatCount.shape[-1]) # ++++++++ plot bar plot about importance of channels ++++++++++++++++ # sum all the counts over glitches SumMatCount = np.sum(MatCount, axis=0) # +++++++++++ plot a table of high ranking channels +++++++++++++++++++ # sort channel based on the sum of 
the counts i.e, importance # a list of sorted Channel names & sorted Importance RankChannel = list( zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]])) # make pandas matrix Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance')) # add ranking label Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1)) # round importance Impdf['Importance'] = Impdf['Importance'].astype(float).round(3) # = df.value1.round() # re-sort columns to make ranking labels coming first cols = Impdf.columns.tolist() cols = cols[-1:] + cols[:-1] Impdf = Impdf[cols] # take the first 5 high ranking channels dcsummary = Impdf.iloc[:5] # plot bars about importance ax.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.) # plot table on the top of the bar plot table = ax.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2]) table.set_fontsize(15) table.scale(0.5, 2) # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++ # set axis label ax.set_ylabel('Importance', fontsize=20) ax.set_xlabel('Auxiliary channel', fontsize=20) ax.grid(False) return None
[docs] def PlotOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file): ''' description: plot glitch indecies versus channels USAGE: PlotOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file) :param MatCount: a matrix comprising importance versus channels :param ListChannelName: a list of channel names :param ListGPS: a list of GPS times :param ListDuration: a list of durations :param output_dir: :param output_file: :return: None dependencies: CreateChannelTicks() ''' ########## #MatCount = MatCount[:, ::-1] #ListChannelName = ListChannelName[::-1] #############3 if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful print('There is only one single glitch. Skip the plot of glitch index VS channels') return None # color the background either with blakc or gray for each channel Listcolor = ['black', 'gray'] * 20 * 7 # get xticks for dominant channels CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName) fig = plt.figure() # make grid for subplots gs = GridSpec(9, 30) # a subplot showing channel position ax1 = fig.add_subplot(gs[:1, :-1]) # First row, up to the second last column for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if edgeRight == MatCount.shape[-1] - 1: # adjust the right edge of the last color span edgeRight = edgeRight + 1 ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) # limit the xaxis of the first subplot ax1.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5) # get rid of ticks of the first plot ax1.yaxis.set_major_locator(plt.NullLocator()) ax1.xaxis.set_major_locator(plt.NullLocator()) # assign a size for the second subplot ax2 = fig.add_subplot(gs[1:, :-1]) #MatCount = np.log10(MatCount+1) # CMAP = 'viridis' CMAP = 'OrRd' # change the color map ax2.imshow(MatCount, aspect='auto', interpolation='nearest', 
cmap=getattr(plt.cm, CMAP), origin='upper') ax2.set_xticks(CenterTicks) ax2.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001)-1+0.5) ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10) ax2.tick_params(axis="y", labelsize=10) ax2.grid(False) ax2.set_ylabel('Glitch index', fontsize=15) ax2.set_xlabel('Auxiliary channel', fontsize=15) ###### Make a color bar cmap = getattr(matplotlib.cm, CMAP) norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max()) ax3 = fig.add_subplot(gs[1:, -1]) cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False) ax3.grid(False) cbar.set_label(r'$q$', size=15) # cbar.set_label('Importance', size=10) ######## fig.subplots_adjust(hspace=0) output_path = os.path.join(output_dir, output_file) fig.savefig(output_path) plt.close() print('Completed Saving a plot of Glitch occurrence VS channel') #-----------------------------------------------------------------+ # make .csv file comprising GPS, importance of all the channels | #-----------------------------------------------------------------+ # concatenate a list of GPS times and importance of all the channels MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1) # make a list of column names columnName = np.append(np.array(['GPS']), np.array(['duration'])) columnName = np.append(columnName, ListChannelName) # convert it to a pandas frame dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv') # save it as .csv file dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels) print('Completed saving a table comprising GPS times, 
durations, and importance of the channels') return None
[docs] def MakeMatrixOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir): ''' description: save matrix of GPS, duration, importance as .csv file USAGE: PlotOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file) :param MatCount: a matrix comprising importance versus channels :param ListChannelName: a list of channel names :param ListGPS: a list of GPS times :param ListDuration: a list of durations :param output_dir: :return: None dependencies: CreateChannelTicks() ''' if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful MatCount = MatCount.reshape(1, MatCount.shape[0]) # make row vector #-----------------------------------------------------------------+ # make .csv file comprising GPS, importance of all the channels | #-----------------------------------------------------------------+ # concatenate a list of GPS times and importance of all the channels MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1) # make a list of column names columnName = np.append(np.array(['GPS']), np.array(['duration'])) columnName = np.append(columnName, ListChannelName) # convert it to a pandas frame dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName) path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv') # save it as .csv file dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels) print('Completed saving a table comprising GPS times, durations, and importance of the channels') return None
[docs] def PlotTimeVSChannel(self, MatCount, ListChannelName, ListGPS, output_dir, output_file, startT, endT, dt): ''' description: make a plot of glitch times versus channels where the time flow is from top to bottom If there are more than one glitch in a time bin, those gltiches' values of importance are averaged If there is no glitches in a time bin, all the values of importance are set to be zero dependencies: CreateChannelTicks() USAGE: PlotTimeVSChannel(MatCount, ListChannelName, output_dir, output_file, startT, endT, dt) :param MatCount: a matrix comprising importance versus channels :param ListChannelName: a list of channel names :param ListGPS: a list of GPS times :param output_dir: a path to an output directory :param output_file: a name of an output file :param startT: start time of an epoch :param endT: end time of an epoch :param dt: step size in sec of the time slice :return: None ''' if MatCount.shape[0] == MatCount.shape[-1]: # if only a glitch, this plot is not useful print('There is only one single glitch. 
Skip the plot of glitch index VS channels') return None # ################ # ListChannelName = np.array(df.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist()) # ListGPS = df.GPS # dt = 60 # MatCount = df.drop(['Unnamed: 0', 'GPS'], axis=1).values # ####################### TimeSlice = np.arange(0, endT - startT, dt) # time steps from the beginning of the epoch with the step size of dt in # make a matrix with all the elements equal to zero, the size of this matrix is # of the time slice in row , # of channels in column TimeSlice_MatCount = np.zeros((TimeSlice.shape[0], MatCount.shape[-1])) # convert the GPS times of the sample to the time since the beginning of the epoch Time_sample = ListGPS - startT # fill the values of the importance of the sample into the time slices for ind in range(len(TimeSlice)): # iterate though the time slice # find the index of the samples that are within this time slice index_within = np.where((Time_sample >= TimeSlice[ind]) & (Time_sample < TimeSlice[ind] + dt))[0] if index_within.size > 0: # if more than one sample is in this time slice # take the average of the values of importance of the samples in this time slice, channel by channel TimeSlice_MatCount[ind, :] = MatCount[index_within[0]:index_within[-1]+1, :].mean(axis=0) else: # if no samples is in this time slice, go to the next iteration pass # color the background either with blakc or gray for each channel Listcolor = ['black', 'gray'] * 20 * 7 # get xticks for dominant channels CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName) fig = plt.figure() # make grid for subplots gs = GridSpec(9, 30) # a subplot showing channel position ax1 = fig.add_subplot(gs[:1, :-1]) # First row, up to the second last column for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if edgeRight == MatCount.shape[-1]: # adjust the right edge of the last color span edgeRight = edgeRight + 1 ax1.axvspan(edgeLeft, edgeRight, 
facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) # limit the xaxis of the first subplot ax1.axis(xmin=0, xmax=TimeSlice_MatCount.shape[-1] * 1.001) # get rid of ticks of the first plot ax1.yaxis.set_major_locator(plt.NullLocator()) ax1.xaxis.set_major_locator(plt.NullLocator()) # assign a size for the second subplot ax2 = fig.add_subplot(gs[1:, :-1]) #MatCount = np.log10(MatCount+1) ax2.imshow(TimeSlice_MatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper') ax2.set_xticks(CenterTicks) ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10) ax2.tick_params(axis="y", labelsize=10) ax2.grid(False) ax2.axis(xmin=0, xmax=TimeSlice_MatCount.shape[-1] * 1.001) start_epoch_UTC = tconvert(startT) # make gwpy object of UTC time # set y label if dt == 60: unit_time = 'mins' elif dt == 3600: unit_time = 'hours' elif dt == 1: unit_time = 'secs' ax2.set_ylabel('Time [{0}] since {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(unit_time, start_epoch_UTC.year, start_epoch_UTC.month, start_epoch_UTC.day, start_epoch_UTC.hour, start_epoch_UTC.minute, start_epoch_UTC.second), fontsize=15) ax2.set_xlabel('Auxiliary channel', fontsize=15) ###### Make a color bar cmap = matplotlib.cm.viridis norm = matplotlib.colors.Normalize(vmin=TimeSlice_MatCount.min(), vmax=TimeSlice_MatCount.max()) ax3 = fig.add_subplot(gs[1:, -1]) cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False) ax3.grid(False) cbar.set_label('Importance', size=10) ######## fig.subplots_adjust(hspace=0) output_path = os.path.join(output_dir, output_file) fig.savefig(output_path) plt.close() print('Completed Saving a plot of time VS channel') return None
def PlotSNRVSChannel(self, MatCount, ListChannelName, ListSNR, output_dir, output_file):
    '''
    description:
        make a plot of SNR of h(t) versus channels: the importance matrix is drawn as an image
        whose rows are ordered by descending SNR, with a strip above it marking the channel groups
    dependencies: CreateChannelTicks()
    USAGE: PlotSNRVSChannel(MatCount, ListChannelName, ListSNR, output_dir, output_file)

    :param MatCount: a matrix comprising importance versus channels (rows are indexed together with ListSNR)
    :param ListChannelName: a list of channel names
    :param ListSNR: a list of SNRs, one per row of MatCount
    :param output_dir: directory in which the figure is saved
    :param output_file: file name of the figure
    :return: None
    '''
    # A single glitch arrives as a 1-D vector. Test the dimensionality rather than
    # comparing shape[0] with shape[-1], which also matched square 2-D matrices.
    if np.ndim(MatCount) < 2:
        print('There is only one single glitch. Skip the plot of SNR VS channels')
        return None

    # alternate the background between black and gray for each channel group
    Listcolor = ['black', 'gray'] * 20 * 7
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)

    n_columns = MatCount.shape[-1]
    # pixel centres sit on integers, so the image spans [-0.5, n_columns - 0.5]
    x_right = round(n_columns * 1.001) - 1 + 0.5

    fig = plt.figure()
    gs = GridSpec(9, 30)  # grid for the subplots

    # top strip showing the channel positions
    ax1 = fig.add_subplot(gs[:1, :-1])
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == n_columns - 1:
            # extend the last span so that it covers the final column
            edgeRight = edgeRight + 1
        ax1.axvspan(edgeLeft - 0.5, edgeRight - 0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
    ax1.axis(xmin=-0.5, xmax=x_right)
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())

    # main panel: rows sorted by descending SNR
    ax2 = fig.add_subplot(gs[1:, :-1])
    descending_order = np.argsort(ListSNR)[::-1]
    MatCountSortedSNR = np.array([MatCount[i, :] for i in descending_order])
    ax2.imshow(MatCountSortedSNR, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.axis(xmin=-0.5, xmax=x_right)
    ax2.grid(False)
    fig.subplots_adjust(hspace=0)

    # y ticks: label only the rows where the rounded SNR changes
    y_ticks = np.sort(ListSNR)[::-1]  # descending values of SNR
    SNR_range = y_ticks.max() - y_ticks.min()
    if SNR_range > 1000:
        RoundPoint = -2  # round to hundreds
    elif SNR_range > 100:
        RoundPoint = -1  # round to tens
    else:
        RoundPoint = 0  # round to units
    ListYticksNew = [0]  # leading 0 is a sentinel, stripped below
    ListYTicksOld = []
    for ind, rounded_snr in enumerate(np.round(y_ticks, RoundPoint)):
        if ListYticksNew[-1] != rounded_snr:
            ListYticksNew.append(rounded_snr)
            ListYTicksOld.append(ind)
    ListYticksNew = ListYticksNew[1:]
    ax2.set_yticks(ListYTicksOld)
    ax2.set_yticklabels(ListYticksNew)
    ax2.set_ylabel('SNR of h(t)', fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)

    # color bar spanning the full range of the importance matrix
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)

    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)  # close this figure explicitly rather than whichever one is current
    print('Completed Saving a plot of SNR VS channel')
    return None
def PlotConfidenceVSChannel(self, MatCount, ListChannelName, ListConf, output_dir, output_file):
    '''
    description:
        make a plot of values of confidence level of Gravity Spy versus channels: the importance
        matrix is drawn as an image whose rows are ordered by descending confidence level,
        with a strip above it marking the channel groups
    dependencies: CreateChannelTicks()
    USAGE: PlotConfidenceVSChannel(MatCount, ListChannelName, ListConf, output_dir, output_file)

    :param MatCount: a matrix comprising importance versus channels (rows are indexed together with ListConf)
    :param ListChannelName: a list of channel names
    :param ListConf: a list of confidence levels, one per row of MatCount
    :param output_dir: directory in which the figure is saved
    :param output_file: file name of the figure
    :return: None
    '''
    # A single glitch arrives as a 1-D vector. Test the dimensionality rather than
    # comparing shape[0] with shape[-1], which also matched square 2-D matrices.
    if np.ndim(MatCount) < 2:
        print('There is only one single glitch. Skip the plot of confidence level VS channels')
        return None

    # alternate the background between black and gray for each channel group
    Listcolor = ['black', 'gray'] * 20 * 7
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)

    n_columns = MatCount.shape[-1]
    # pixel centres sit on integers, so the image spans [-0.5, n_columns - 0.5]
    x_right = round(n_columns * 1.001) - 1 + 0.5

    fig = plt.figure()
    gs = GridSpec(9, 30)  # 9 rows, 30 columns

    # top strip showing the channel positions
    ax1 = fig.add_subplot(gs[:1, :-1])
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == n_columns - 1:
            # extend the last span so that it covers the final column
            edgeRight = edgeRight + 1
        ax1.axvspan(edgeLeft - 0.5, edgeRight - 0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
    ax1.axis(xmin=-0.5, xmax=x_right)
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())

    # main panel: rows sorted by descending confidence level
    ax2 = fig.add_subplot(gs[1:, :-1])
    descending_order = np.argsort(ListConf)[::-1]
    MatCountSortedConf = np.array([MatCount[i, :] for i in descending_order])
    ax2.imshow(MatCountSortedConf, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.grid(False)
    ax2.axis(xmin=-0.5, xmax=x_right)
    fig.subplots_adjust(hspace=0)

    # y ticks: label only the rows where the confidence, rounded to one decimal, changes
    y_ticks = np.sort(ListConf)[::-1]
    ListYticksNew = [0]  # leading 0 is a sentinel, stripped below
    ListYTicksOld = []
    for ind, rounded_conf in enumerate(np.round(y_ticks, 1)):
        if ListYticksNew[-1] != rounded_conf:
            ListYticksNew.append(rounded_conf)
            ListYTicksOld.append(ind)
    ListYticksNew = ListYticksNew[1:]
    ax2.set_yticks(ListYTicksOld)
    ax2.set_yticklabels(ListYticksNew)
    ax2.set_ylabel('Confidence level', fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)

    # color bar spanning the full range of the importance matrix
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)

    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)  # close this figure explicitly rather than whichever one is current
    print('Completed Saving a plot of Confidence VS channel')
    return None
def TableImportance(self, MatCount, ListChannelName, output_dir, output_file):
    '''
    description:
        make a table of values of importance as a .csv file. Channels are ranked by their
        importance averaged over glitches (rows of MatCount), highest first.
    USAGE: TableImportance(MatCount, ListChannelName, output_dir, output_file)

    :param MatCount: a matrix comprising importance versus channels; a 1-D vector is treated as a single glitch
    :param ListChannelName: a list of channel names
    :param output_dir: directory in which the table is saved
    :param output_file: file name of the table
    :return: None
    '''
    MatCount = np.asarray(MatCount)
    # A single glitch arrives as a 1-D vector: promote it to one row. The dimensionality is
    # tested directly because comparing shape[0] with shape[-1] also matched square matrices,
    # for which the reshape then failed.
    if MatCount.ndim == 1:
        MatCount = MatCount.reshape(1, -1)
    ListChannelName = np.asarray(ListChannelName)

    # importance averaged over glitches, channel by channel
    mean_importance = np.sum(MatCount, axis=0) / float(MatCount.shape[0])
    # channel indices from the most to the least important (computed once)
    ranking = np.argsort(mean_importance)[::-1]

    # columns: rank (1 = most important), label (original column index), channel name, importance
    Impdf = pd.DataFrame(
        {
            'rank': np.arange(1, MatCount.shape[-1] + 1, 1),
            'label': ranking,
            'Channel': ListChannelName[ranking],
            'Importance': np.round(mean_importance[ranking].astype(float), 3),
        },
        columns=['rank', 'label', 'Channel', 'Importance'],
    )

    output_path = os.path.join(output_dir, output_file)
    Impdf.to_csv(output_path)
    print('Completed saving a table of importance of channels')
    return None
def BinomialDist(self, k, n, likelihood):
    '''
    description:
        Binomial distribution, evaluated with scipy.stats.binom so that large n or k do not
        overflow the way an explicit factorial formula does
    USAGE: out = BinomialDist(k, n, likelihood)

    :param k: the number of detections
    :param n: the number of trials
    :param likelihood: likelihood of detection in a single trial
    :return:
        out: probability mass of finding k detections out of n trials
    '''
    return binom.pmf(k, n=n, p=likelihood)
def BinomialTest(self, k, N, rate):
    '''
    description:
        compute p-value of one-tailed Binomial test against null rate
    USAGE: p_value = BinomialTest(k, N, rate)

    :param k: observed number of successes
    :param N: total number of samples
    :param rate: rate of successes drawn from a null hypothesis
    :return:
        p_value: probability of observing k or more successes out of N under the null rate
    '''
    # The upper-tail probability P(X >= k) is the survival function evaluated at k - 1.
    # Halving the two-sided p-value only equals this tail when the null distribution is
    # symmetric (rate = 0.5 and k not at the centre), and scipy.stats.binom_test no longer
    # exists in SciPy >= 1.12, so the exact tail is computed directly.
    p_value = binom.sf(k - 1, N, rate)
    return p_value
def query_targetglitch_null(self, path_target_glitch_dataset, path_null_dataset):
    '''
    description:
        load the two .csv files; if either cannot be read, report it and end the program
    USAGE: df1, df0 = query_targetglitch_null(path_target_glitch_dataset, path_null_dataset)

    :param path_target_glitch_dataset: a path to .csv file of a target glitch class
    :param path_null_dataset: a path to .csv file of a null dataset
    :return:
        df1: a pandas dataframe of a target glitch class
        df0: a pandas dataframe of a null dataset
    '''
    loaded = []
    # the target file is read first, then the null file; the first failure stops the program
    for csv_path in (path_target_glitch_dataset, path_null_dataset):
        try:
            frame = pd.read_csv(csv_path)
        except IOError:
            print('No such a file {}'.format(csv_path))
            sys.exit()
        loaded.append(frame)
    df1, df0 = loaded
    return df1, df0
def Student_t_independet_test(self, data1, data2, Welch_test=True):
    '''
    description:
        independent two-sample Student t-test (Welch's unequal-variance variant by default)
    USAGE: stat, p_value = self.Student_t_independet_test(data1, data0)

    :param data1: a population
    :param data2: another population
    :param Welch_test: when True the variances are not assumed equal (Welch's test)
    :return:
        t_value: t-value (critical value)
        p_value: p-value
    '''
    total_spread = data1.std(ddof=1) + data2.std(ddof=1)
    if total_spread == 0:
        # both samples are constant: the statistic would be infinite or undefined,
        # so "no difference" (t = 0, p = 1) is reported by hand
        return 0, 1

    # Welch's test corresponds to equal_var=False; otherwise scipy's pooled-variance default
    pooled_variance = not (Welch_test == True)
    t_value, p_value = ttest_ind(data1, data2, equal_var=pooled_variance)
    return t_value, p_value
def chisquare_test(self, target_mat, null_mat):
    '''
    description:
        calculate the values of chi-square of each channel in each glitch and put out
        values of chi-square and corresponding p-values
    USAGE: chi2_value, p_value = chisquare_test(target_mat, null_mat)

    :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
    :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
    :return:
        chi2_value: a matrix of chi-square values where glitch indices are in rows and channels are in columns
        p_value: a matrix of p-values where glitch indices are in rows and channels are in columns
    '''
    # a single-point chi-square test has one degree of freedom
    dof = 1
    mean_null = null_mat.mean(axis=0)  # mean of the null samples, channel by channel
    std_null = null_mat.std(ddof=1, axis=0)  # sample standard deviation of the null samples, channel by channel

    # channels with zero null spread divide by zero; those entries are repaired just below,
    # so the floating-point warnings are silenced here
    with np.errstate(divide='ignore', invalid='ignore'):
        chsqr_values = ((target_mat - mean_null) / std_null) ** 2

    # when std_null is zero:
    #   0 / 0 gives NaN, where the chi-square is meant to be zero
    #   x / 0 gives +inf, which is forced to zero as well (a deliberate simplification)
    chsqr_values[np.isnan(chsqr_values)] = 0
    chsqr_values[np.isposinf(chsqr_values)] = 0

    # right-sided p-value; the survival function keeps precision in the far tail,
    # where 1 - cdf rounds to exactly zero
    p_values = chi2.sf(chsqr_values, dof)
    return chsqr_values, p_values
def find_channels(self, df_target):
    '''
    description:
        find a list of channels from the target samples: every column except the
        bookkeeping columns ('Unnamed: 0', 'GPS' and, in the new table format, 'duration')
    USAGE: channels = find_channels(df_target)

    :param df_target: target samples in pandas frame
    :return:
        channels: a numpy array of channel names
    '''
    try:
        # new format of the table
        channel_frame = df_target.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1)
    except (KeyError, ValueError):
        # old format of the table (no 'duration' column). Current pandas raises KeyError for
        # a missing label while old releases raised ValueError, so both are handled.
        channel_frame = df_target.drop(['Unnamed: 0', 'GPS'], axis=1)
    return np.array(channel_frame.columns.tolist())
def Calculate_number_and_rate(self, df_target, df_null, d_c, err_cal, channels):
    '''
    description:
        1. use the target and null samples
        2. calculate the numbers of the target and null samples above threshold
        3. calculate the fraction of the target and null samples above threshold
        4. calculate the statistic rate1 / (rate1 + rate0) of each channel
    USAGE: channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err = Calculate_number_and_rate(df_target, df_null, d_c, err_cal, channels)

    :param df_target: target samples in pandas frame
    :param df_null: null samples in pandas frame
    :param d_c: a threshold. if it is None, the threshold of each channel is the mean value of the null samples in that channel
    :param err_cal: boolean, whether to calculate the error of the statistic or not
    :param channels: a list of channels; None or an int (the callers' sentinel is 0) means all channels found in the target samples
    :return:
        channels: a list of channels
        list_num1: a list of the number of the target samples above a threshold
        list_num0: a list of the number of the null samples above a threshold
        total_sample1: the number of the target samples, float
        total_sample0: the number of the null samples, float
        list_Causal: a list of the statistic
        list_causal_err: a list of the errors of the statistic. It is None if err_cal = False
    '''
    if channels is None or isinstance(channels, int):
        # channels are not provided: use all the channels in the target samples
        channels = self.find_channels(df_target)

    total_sample1 = float(df_target.shape[0])  # total number of target glitches
    total_sample0 = float(df_null.shape[0])  # total number of null samples

    # plain lists are filled in the loop and converted once at the end
    counts_target, counts_null, causal_values, causal_errors = [], [], [], []

    for ch in channels:
        column = '{}'.format(ch)
        # The detection threshold is resolved per channel. It is kept in a local name so a
        # missing d_c is not overwritten with the first channel's mean and reused afterwards.
        threshold = df_null[column].mean() if d_c is None else d_c

        # numbers of samples whose importance exceeds the threshold
        num_d1 = int((df_target[column] > threshold).sum())
        num_d0 = int((df_null[column] > threshold).sum())
        # fractions of samples above the threshold
        rate1 = num_d1 / total_sample1
        rate0 = num_d0 / total_sample0
        rate_sum = rate1 + rate0

        # if nothing exceeds the threshold in either set the statistic is set to zero
        # (such a channel is not expected to pass the Binomial test)
        Causal = rate1 / rate_sum if rate_sum != 0 else 0

        if err_cal:
            # binomial standard errors of the two fractions
            err1 = np.sqrt(rate1 * (1 - rate1) / total_sample1)
            err0 = np.sqrt(rate0 * (1 - rate0) / total_sample0)
            if rate_sum != 0:
                # error propagation of C = r1 / (r1 + r0):
                #   dC/dr1 = r0 / (r1 + r0)^2,  dC/dr0 = -r1 / (r1 + r0)^2
                causal_err = np.sqrt(rate0 ** 2 * err1 ** 2 + rate1 ** 2 * err0 ** 2) / rate_sum ** 2
            else:
                causal_err = 0
            causal_errors.append(causal_err)

        counts_target.append(num_d1)
        counts_null.append(num_d0)
        causal_values.append(Causal)

    list_num1 = np.array(counts_target, dtype=float)
    list_num0 = np.array(counts_null, dtype=float)
    list_Causal = np.array(causal_values, dtype=float)
    list_causal_err = np.array(causal_errors, dtype=float) if err_cal else None

    return channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err
def find_meaing_ful_confidence(self, df1, df0, BinomialTestConfidence, d_c=None, err_cal=False, channels=0):
    '''
    description:
        1. take target glitches (df1) and the null dataset (df0)
        2. compute the statistic of every channel with Calculate_number_and_rate()
        3. perform one-tailed Binomial test on every channel with BinomialTest()
    USAGE: list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = find_meaing_ful_confidence(df1, df0, BinomialTestConfidence, d_c)

    :param df1: target glitches in the pandas format
    :param df0: null dataset in the pandas format
    :param BinomialTestConfidence: a confidence level used for one-tailed Binomial test
    :param d_c: user defined threshold to claim detection of a glitch, None in default
        (forwarded unchanged to Calculate_number_and_rate)
    :param err_cal: boolean, whether the error of the statistic is calculated
    :param channels: a list of channels, 0 in default
    :return:
        list_Causal_passed: the statistic of the channels that passed the test, zero elsewhere
        list_Causal_fail: the statistic of the channels that failed the test, zero elsewhere
        list_causal_passed_err: the error of the statistic of the channels that passed, zero elsewhere (None if err_cal is not True)
        list_causal_failed_err: the error of the statistic of the channels that failed, zero elsewhere (None if err_cal is not True)
        list_Test: a list of results of the Binomial test, 'pass' or 'fail'
        channels: a list of channel names
    '''
    (channels, list_num1, list_num0, total_sample1, total_sample0,
     list_Causal, list_causal_err) = self.Calculate_number_and_rate(
        df1, df0, d_c=d_c, err_cal=err_cal, channels=channels)

    with_errors = err_cal == True
    # fraction of the null samples above the threshold, channel by channel
    list_rate0 = list_num0 / total_sample0

    # "passed" arrays start at zero and are filled only where the test is passed;
    # the zero/non-zero split is what the plotting relies on
    list_Causal_passed = np.zeros(list_Causal.shape[0])
    list_causal_passed_err = np.zeros(list_causal_err.shape[0]) if with_errors else None

    list_Test = np.array([])
    for index in range(len(channels)):
        # p-value of the one-tailed Binomial test of this channel against the null rate
        p_value = self.BinomialTest(list_num1[index], total_sample1, list_rate0[index])
        if not p_value < (1 - BinomialTestConfidence):
            list_Test = np.append(list_Test, 'fail')
            continue
        list_Causal_passed[index] = list_Causal[index]
        if with_errors:
            list_causal_passed_err[index] = list_causal_err[index]
        list_Test = np.append(list_Test, 'pass')

    # whatever did not pass is, by subtraction, what failed
    list_Causal_fail = list_Causal - list_Causal_passed
    list_causal_failed_err = (list_causal_err - list_causal_passed_err) if with_errors else None

    return list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, channels
def PlotCausalityVSChannel(self, list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
    '''
    description:
        Plot the witness ratio statistic (WRS) of every channel as a bar chart, with a table
        of the five top-ranked channels above it, and save the figure
    dependencies: CreateChannelTicks(), ranking_channels()
    USAGE: PlotCausalityVSChannel(list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence)

    :param list_Causal_passed: the statistic of the channels that passed the one-tailed Binomial test, zero elsewhere
    :param list_Causal_fail: the statistic of the channels that failed the one-tailed Binomial test, zero elsewhere
    :param list_causal_passed_err: errors of the passed statistic (accepted but not drawn)
    :param list_causal_failed_err: errors of the failed statistic (accepted but not drawn)
    :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
    :param ListChannelName: a list of channel names
    :param output_dir: directory in which the figure is saved
    :param output_file: file name of the figure
    :param BinomialTestConfidence: binomial test confidence level (used in the table header)
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    n_entries = list_Causal_passed.shape[-1]

    # ---- background: alternate black / gray for each channel group ----
    Listcolor = ['black', 'gray'] * 20 * 7
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        # stretch the last span by one so that the background matches the plot size
        span_end = edgeRight + 1 if edgeRight == n_entries - 1 else edgeRight
        span = plt.axvspan(edgeLeft, span_end, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        span.set_edgecolor(None)

    # ---- bars: failed channels in faint light blue, passed channels in solid blue ----
    failed_mask = list_Causal_fail != 0
    passed_mask = list_Causal_passed != 0
    plt.bar(np.where(failed_mask)[0], list_Causal_fail[failed_mask], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
    plt.bar(np.where(passed_mask)[0], list_Causal_passed[passed_mask], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)

    # ---- table of ranked channels ----
    # each channel is non-zero in at most one of the two lists, so the sum is the statistic itself
    list_Causal = list_Causal_passed + list_Causal_fail
    # ordering of the channels used for the ranking
    order, _, _ = self.ranking_channels(list_Causal, list_Test)
    test_header = 'Test{}'.format(BinomialTestConfidence)

    if ListChannelName[0][-3:-1] == "BP":
        # multi-frequency band search: names look like <channel>_BP#, ordered as
        # channel A band 0, channel A band 1, ..., channel B band 0, ...
        # an unbounded band (None-None) is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # cycle through the bands [band 0, band 1, ..., band 0, band 1, ...] to line up with the names
        band_labels = np.tile(band_labels, ListChannelName.shape[0])
        # strip the band suffix to recover the original channel name
        base_names = np.array([ChannelName.rsplit("_BP", 1)[0] for ChannelName in ListChannelName])
        ranked_rows = list(zip(base_names[order], band_labels[order], list_Causal[order], list_Test[order]))
        header = ('Channel', 'Band [Hz]', 'WRS', test_header)
        ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        ranked_rows = list(zip(ListChannelName[order], list_Causal[order], list_Test[order]))
        header = ('Channel', 'WRS', test_header)
        ColWidths = [0.05, 0.6, 0.1, 0.15]

    Prodf = pd.DataFrame(np.array(ranked_rows), columns=header)
    Prodf = Prodf.assign(rank=np.arange(1, n_entries + 1, 1))
    Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)
    # put the rank column first
    Prodf = Prodf[['rank'] + list(header)]

    # only the five highest-ranked channels go into the table above the bars
    dcsummary = Prodf.head(5)
    table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)

    # ---- ticks and labels ----
    # x ticks carry the dominant sub-system names and are drawn inside, along the top
    plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, ListSubsys.shape[0])
    plt.ylabel('Witness ratio statistic', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    # leave room at the top for the table
    plt.tight_layout(rect=[0, 0.03, 1, 0.8])

    # ---- save ----
    output_path = os.path.join(output_dir, output_file)
    plt.savefig(output_path)
    plt.close()
    print('Completed Saving a plot of the witness ratio statisti VS channel')
    return None
def TableCausality(self, list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
    '''
    description:
        make a table of witness ratio statistic (WRS) of channels as a .csv file,
        ordered by the ranking returned by ranking_channels()
    dependencies: ranking_channels()
    USAGE: TableCausality(list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence)

    :param list_Causal_passed: the statistic of the channels that passed the one-tailed Binomial test, zero elsewhere
    :param list_Causal_fail: the statistic of the channels that failed the one-tailed Binomial test, zero elsewhere
    :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
    :param ListChannelName: a list of channel names
    :param output_dir: directory in which the table is saved
    :param output_file: file name of the table
    :param BinomialTestConfidence: binomial test confidence level (used in the column header)
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # each channel is non-zero in at most one of the two lists, so the sum is the statistic itself
    list_Causal = list_Causal_passed + list_Causal_fail
    # ordering of the channels used for the ranking
    order, _, _ = self.ranking_channels(list_Causal, list_Test)
    test_header = 'Test{}'.format(BinomialTestConfidence)

    if ListChannelName[0][-3:-1] == "BP":
        # multi-frequency band search: names look like <channel>_BP#, ordered as
        # channel A band 0, channel A band 1, ..., channel B band 0, ...
        # an unbounded band (None-None) is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # cycle through the bands [band 0, band 1, ..., band 0, band 1, ...] to line up with the names
        band_labels = np.tile(band_labels, ListChannelName.shape[0])
        # strip the band suffix to recover the original channel name
        base_names = np.array([ChannelName.rsplit("_BP", 1)[0] for ChannelName in ListChannelName])
        ranked_rows = list(zip(base_names[order], band_labels[order], list_Causal[order], list_Test[order]))
        header = ('Channel', 'Band [Hz]', 'WRS', test_header)
    else:
        # without bands the original position of each channel is kept as 'label'
        ranked_rows = list(zip(order, ListChannelName[order], list_Causal[order], list_Test[order]))
        header = ('label', 'Channel', 'WRS', test_header)

    Prodf = pd.DataFrame(np.array(ranked_rows), columns=header)
    Prodf = Prodf.assign(rank=np.arange(1, list_Causal.shape[-1] + 1, 1))
    Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)
    # put the rank column first
    Prodf = Prodf[['rank'] + list(header)]

    output_path = os.path.join(output_dir, output_file)
    Prodf.to_csv(output_path, index=False)
    print('Completed saving a table of the witness ratio statistic of channels')
    return None
# def calculate_FAP(self, df1, df0, mode): # ''' # description: # 1. calculate false alarm probability (FAP) # USAGE: df_FAP = calculate_FAP(df1, df0, mode='indiv') # # :param df1: a target smaples in pandas frame # :param df0: a null samples in pandas frame # :param mode: a mode to calculate FAP. 'whole' calculates FAP of the average value of importance of glitches 'indiv' calculates FAP of each glitch in each channel # :return: # if mode == 'whole': # list_FAP: numpy array of FAPs of channels # list_FAP_err: numpy array of errors of FAP of channels # channels: a list of channels # USAGE: list_FAP, list_FAP_err = calculate_FAP(df1, df0, mode='whole') # elif mode == 'indiv': # df_FAP: a matrix of FAP of each glitch in each channel in pandas frame # channels: a list of channels # ''' # try: # channels = np.array(df1.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1).columns.tolist()) # except ValueError: # channels = np.array(df1.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist()) # if mode == 'whole': # calculate false alarm probablity (FAP) of the average value of importance of target glitches # list_FAP = np.array([]) # list_FAP_err = np.array([]) # total_sample0 = df0.shape[0] # total number of dummy quite samples # for ch in channels: # iterate though channels # # calculate the threshold of claiming detection of a glitch # d_c = df1['{}'.format(ch)].mean() # # calculate the number of samples that have values of importance greater than the detection threshold # num_d0 = df0[df0['{}'.format(ch)] > d_c].shape[0] # # calculate the fraction samples that have values greater than the threshold, FAP # FAP = num_d0 / float(total_sample0) # # calculate the probability of the causality # FAP_err = np.sqrt(FAP * (1 - FAP) / float(total_sample0)) # the error of false positive based on multinominal distribution # # fill the values # list_FAP = np.append(list_FAP, FAP) # list_FAP_err = np.append(list_FAP_err, FAP_err) # return list_FAP, list_FAP_err, channels # if mode == 
'indiv': # df_FAP = pd.DataFrame() # total_sample0 = float(df0.shape[0]) # total number of dummy quite samples # for ch in channels: # iterate though channels # # calculate FAP of each glitch for this channel # # note that d_c is measured importance of this glitch in this channel # # note that [df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 is a fraction of null samples above d_c, which is FAP # FAP = np.array([df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 for d_c in df1['{}'.format(ch)]]) # df_FAP['{}'.format(ch)] = FAP # return df_FAP, channels
def calculate_fap(self, target_mat, null_mat):
    '''
    description:
        calculate the values of fap (false alarm probability) of each channel in each glitch.
        For every channel a beta distribution is fitted to the null samples by the method of
        moments, and the fap of a target value is the upper-tail probability of that fit.
    USAGE: faps = calculate_fap(target_mat, null_mat)

    :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
    :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
    :return:
        faps: a matrix of fap values with the shape of target_mat
    '''
    # mean and (sample) variance of the null samples, channel by channel
    mean_null = np.asarray(null_mat.mean(axis=0), dtype=float)
    var_null = np.asarray(null_mat.var(ddof=1, axis=0), dtype=float)

    # method-of-moments parameters of the beta distribution:
    #   a = m^2 (1 - m) / v - m,   b = a (1 - m) / m
    # channels where the formula is not defined are marked with -1 and repaired below
    fit_defined = (mean_null != 0) & (var_null != 0)
    list_a_null = np.full(mean_null.shape, -1.0)
    list_a_null[fit_defined] = (mean_null[fit_defined] ** 2 * (1. - mean_null[fit_defined]) / var_null[fit_defined]
                                - mean_null[fit_defined])
    nonzero_mean = mean_null != 0
    list_b_null = np.full(mean_null.shape, -1.0)
    list_b_null[nonzero_mean] = list_a_null[nonzero_mean] * (1 - mean_null[nonzero_mean]) / mean_null[nonzero_mean]

    # mean and variance both zero: the null is a delta function at zero and the fap would be
    # NaN, so a beta distribution that is steep at zero is set by hand
    degenerate = (mean_null == 0) & (var_null == 0)
    list_a_null[degenerate] = 0.01
    list_b_null[degenerate] = 100

    # a negative "a" is not allowed in a beta distribution. It appears when the fit is
    # undefined (marker -1) or when m - m^2 - v < 0, i.e. the null looks like two spikes at
    # 0 and 1. "a" is then set to 0.01 by hand and "b" is re-calculated from it.
    negative_a = list_a_null < 0
    list_a_null[negative_a] = 0.01
    with np.errstate(divide='ignore', invalid='ignore'):
        list_b_null[negative_a] = list_a_null[negative_a] * (1 - mean_null[negative_a]) / mean_null[negative_a]

    # upper-tail probability; the survival function keeps precision for very small faps,
    # where 1 - cdf rounds to exactly zero
    faps = beta.sf(target_mat, list_a_null, list_b_null)
    return faps
def calculate_reweighted_importance(self, df1, df_FAP):
    '''
    description:
        reweight the values of importance using FAP
        the reweighted importance is defined as rho_new = rho / (FAP + 1)
    USAGE: df1_new = calculate_reweighted_importance(df1, df_FAP)

    :param df1: target samples in pandas frame; every column other than
                'Unnamed: 0', 'GPS' and 'duration' is treated as a channel
    :param df_FAP: a FAP matrix in pandas frame that has (at least) the same channel columns;
                   rows are matched to df1 by position
    :return:
        df1_new: pandas frame with the non-channel columns of df1 followed by the reweighted channel columns
    '''
    # DataFrame.drop raises KeyError (not ValueError) for a missing label, so the old
    # try/except fallback never triggered; errors='ignore' drops whichever of the
    # bookkeeping columns are actually present.
    meta_columns = ['Unnamed: 0', 'GPS', 'duration']
    channels = df1.drop(columns=meta_columns, errors='ignore').columns.tolist()

    mat1 = df1[channels].values
    mat_FAP = df_FAP[channels].values
    mat1_new = mat1 / (mat_FAP + 1)

    # reuse the index of df1: concat(axis=1) aligns on the index, so a fresh RangeIndex
    # would scatter the rows and pad with NaN whenever df1 is not indexed 0..n-1
    df1_reweighed = pd.DataFrame(mat1_new, columns=channels, index=df1.index)
    df1_new = df1.drop(channels, axis=1)
    df1_new = pd.concat([df1_new, df1_reweighed], axis=1)
    return df1_new
def perform_Welch_test(self, df_target, df_null, confidence_level, channels=0):
    '''
    description:
        run a one-sided t-test per channel, comparing target samples against null samples.
        A channel passes when half of the two-sided p-value is below 1 - confidence_level
        and the t-value is positive.
    USAGE: channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level = perform_Welch_test(df_target, df_null, confidence_level, channels=0)

    :param df_target: target samples in pandas frame
    :param df_null: null samples in pandas frame
    :param confidence_level: a confidence level
    :param channels: a list of channels; an int (0 in default) means "use the channels found in df_target"
    :return
        channels: a list of channels
        list_t_values_passed: t-values of the channels that pass the test, zero elsewhere
        list_t_values_failed: t-values of the channels that fail the test, zero elsewhere
        list_Test: a list of the test results {'pass', 'fail'}
        confidence_level: the confidence level that was used
    '''
    if type(channels) is int:
        # no explicit channel list: fall back to the channels of the target samples
        channels = self.find_channels(df_target)

    t_passed = []
    t_failed = []
    verdicts = []
    for channel in channels:
        column = '{}'.format(channel)
        if df_null[column].std(ddof=1) == 0:
            # zero-variance null: substitute a narrow Gaussian around the null mean
            reference = np.random.normal(loc=df_null[column].mean(), scale=0.01, size=1000)
        else:
            reference = df_null[column]
        t_value, p_value = self.Student_t_independet_test(df_target[column], reference)

        # one-sided decision derived from the two-sided p-value
        rejected = p_value / 2. < (1 - confidence_level) and t_value > 0
        if rejected:
            t_passed.append(t_value)
            t_failed.append(0)
            verdicts.append('pass')
        else:
            t_passed.append(0)
            t_failed.append(t_value)
            verdicts.append('fail')

    # a single append onto an empty float array keeps the dtypes of the element-wise version
    list_t_values_passed = np.append(np.array([]), t_passed)
    list_t_values_failed = np.append(np.array([]), t_failed)
    list_Test = np.append(np.array([]), verdicts)
    return channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level
def perform_beta_dist(self, df_target, df_null, channels=0):
    '''
    description:
        fit a beta distribution to the target samples and to the null samples of each
        channel by matching the sample mean and the unbiased sample variance
    USAGE: rv_t_dict, rv_n_dict = perform_beta_dist(df_target, df_null, channels=0)

    :param df_target: target samples in pandas frame
    :param df_null: null samples in pandas frame
    :param channels: channels (optional); an int (0 in default) means "use the channels found in df_target"
    :return:
        rv_t_dict: a dictionary {channel: frozen scipy beta distribution} for the target samples
        rv_n_dict: a dictionary {channel: frozen scipy beta distribution} for the null samples
    '''
    def _fit_beta(samples):
        # moment-matched beta fit with fixed fallbacks for degenerate samples
        mean = samples.mean()
        var = samples.var(ddof=1)
        if mean == 0:
            # all mass at zero: use a distribution that is steep at zero
            a, b = 0.1, 100
        else:
            a = 0.1 if var == 0 else mean ** 2 * (1. - mean) / var - mean
            b = a * (1 - mean) / mean
        if a < 0:
            # negative shape parameter is not allowed; reset a and re-derive b
            a = 0.1
            b = a * (1 - mean) / mean
        return beta(a, b)

    if type(channels) is int:
        channels = self.find_channels(df_target)

    rv_t_dict, rv_n_dict = {}, {}
    for channel in channels:
        target_values = df_target[channel].values
        null_values = df_null[channel].values
        rv_t_dict[channel] = _fit_beta(target_values)
        rv_n_dict[channel] = _fit_beta(null_values)
    return rv_t_dict, rv_n_dict
def perform_p_greater(self, df_target, rv_t_dict, rv_n_dict, confidence_level, channels=0):
    '''
    description:
        calculate p_greater for channels, i.e. the integral over [-0.1, 1.1] of
        pdf_target(x) * cdf_null(x).
        As a safety check p_belong is evaluated at the smallest and the largest target
        value; p_greater is set to 0.5 unless p_belong is larger at the largest value.
    USAGE: channels, p_greater_dict, list_p_greater, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level = perform_p_greater(df_target, rv_t_dict, rv_n_dict, confidence_level, channels=0)

    :param df_target: target samples in pandas frame
    :param rv_t_dict: a dictionary of beta distribution (scipy obj) for the target samples
    :param rv_n_dict: a dictionary of beta distribution (scipy obj) for the null samples
    :param confidence_level: confidence level
    :param channels: channels; an int (0 in default) means "use the channels found in df_target"
    :return:
        channels: list of channels
        p_greater_dict: a dictionary {channel: p_greater}
        list_p_greater: numpy array of p_greater
        list_p_greater_passed: p_greater where it is greater than the confidence level, otherwise 0
        list_p_greater_failed: p_greater where it is not greater than the confidence level, otherwise 0
        list_Test: numpy array of {'pass', 'fail'}
        confidence_level: confidence level
    '''
    if isinstance(channels, int):
        # if channels are not provided, use all the channels in the target samples
        channels = self.find_channels(df_target)

    dbin = 0.00005  # width of the probability bin around x

    def _p_belong(x, samples, rv_t, rv_n):
        # probability that a value x belongs to the target rather than the null distribution
        if x == 0:
            x += 0.0001
        if x == 1:
            x -= 0.0001
            below_one = samples[samples != 1]
            # when every sample equals 1 there is nothing below one to compare with;
            # the unguarded .max() used to raise ValueError on the empty selection
            if below_one.size and below_one.max() < 0.5 * x:
                # x = 1 sits at the very edge of a long tail where both products coincide
                x = below_one.max() + 0.1
        fore = (rv_t.cdf(x + dbin) - rv_t.cdf(x)) * rv_n.cdf(x)
        back = (rv_n.cdf(x + dbin) - rv_n.cdf(x)) * rv_t.cdf(x)
        if fore == back:
            return 0.5
        return fore / float(fore + back)

    p_greater_dict = {}
    for channel in channels:
        T = df_target[channel].values
        rv_t = rv_t_dict[channel]
        rv_n = rv_n_dict[channel]
        p_belong_low = _p_belong(T.min(), T, rv_t, rv_n)
        p_belong_high = _p_belong(T.max(), T, rv_t, rv_n)
        if p_belong_low < p_belong_high:
            # p_belong grows from the smallest to the largest sample: integrate
            p_greater, _ = integrate.quad(lambda x: rv_t.pdf(x) * rv_n.cdf(x), -0.1, 1.1)
        else:
            p_greater = 0.5
        p_greater_dict[channel] = p_greater

    # build the output arrays in one go instead of growing them with np.append
    list_p_greater = np.array([p_greater_dict[channel] for channel in channels], dtype=float)
    passed = list_p_greater > confidence_level
    list_p_greater_passed = np.where(passed, list_p_greater, 0.0)
    list_p_greater_failed = np.where(passed, 0.0, list_p_greater)
    list_Test = np.where(passed, 'pass', 'fail')
    return channels, p_greater_dict, list_p_greater, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level
def perform_p_belong(self, df_target, p_greater_dict, rv_t_dict, rv_n_dict, confidence_level, channels=0):
    '''
    description:
        calculate p_belong for each channel in each glitch,
        keeping only the channels whose p_greater is greater than 0.5.
        For a sample value x,
            fore = (cdf_t(x + dbin) - cdf_t(x)) * cdf_n(x)
            back = (cdf_n(x + dbin) - cdf_n(x)) * cdf_t(x)
            p_belong = fore / (fore + back), or 0.5 when fore == back
    USAGE: list_channels_passed, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level = perform_p_belong(df_target, p_greater_dict, rv_t_dict, rv_n_dict, confidence_level, channels=0)

    :param df_target: target samples in pandas frame (needs "GPS" and "duration" columns)
    :param p_greater_dict: a dictionary {channel: p_greater}
    :param rv_t_dict: a dictionary of beta distribution (scipy obj) for the target samples
    :param rv_n_dict: a dictionary of beta distribution (scipy obj) for the null samples
    :param confidence_level: confidence level
    :param channels: channels (optional); an int (0 in default) means "use the channels found in df_target"
    :return:
        list_channels_passed: numpy array of channels whose p_greater is above 0.5
        list_GPS: a list of GPS times of the target samples
        list_duration: list of durations
        mat_p_belong: a matrix of p_belong where glitch samples are in rows and passed channels are in columns
                      (an empty 1-D array when no channel passed)
        mat_Test: a matrix of {'pass', 'fail'}; 'pass' where p_belong is greater than confidence_level
        confidence_level: confidence level used
    '''
    if isinstance(channels, int):
        # if channels are not provided, use all the channels in the target samples
        channels = self.find_channels(df_target)

    list_GPS = df_target["GPS"].values
    list_duration = df_target["duration"].values

    dbin = 0.00005  # width of the probability bin around x
    passed_channels = []
    columns = []
    for channel in channels:
        if not p_greater_dict[channel] > 0.5:
            continue
        passed_channels.append(channel)
        rv_t = rv_t_dict[channel]
        rv_n = rv_n_dict[channel]
        T = df_target[channel].values

        # evaluation points: the samples, nudged away from the boundaries 0 and 1
        x = np.array(T, dtype=float)
        x[T == 0] = 0.0001
        is_one = T == 1
        if is_one.any():
            near_one = 1 - 0.0001
            below_one = T[~is_one]
            # guard the empty selection: every sample equal to 1 used to raise ValueError
            if below_one.size and below_one.max() < 0.5 * near_one:
                # x = 1 sits at the very edge of a long tail where fore and back coincide
                x[is_one] = below_one.max() + 0.1
            else:
                x[is_one] = near_one

        # one vectorised cdf evaluation per channel instead of one per sample
        cdf_t = rv_t.cdf(x)
        cdf_n = rv_n.cdf(x)
        fore = (rv_t.cdf(x + dbin) - cdf_t) * cdf_n
        back = (rv_n.cdf(x + dbin) - cdf_n) * cdf_t
        p_belong = np.full(x.shape, 0.5)
        # entries with fore == back keep the 0.5 default and are never divided
        np.divide(fore, fore + back, out=p_belong, where=fore != back)
        columns.append(p_belong)

    list_channels_passed = np.array(passed_channels)
    mat_p_belong = np.column_stack(columns) if columns else np.array([])

    # 'pass' where p_belong exceeds the confidence level
    mat_Test = np.where(mat_p_belong > confidence_level, 'pass', 'fail').reshape(
        mat_p_belong.shape[0], mat_p_belong.shape[-1])
    return list_channels_passed, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level
def ranking_channels(self, list_ranking_statistic, list_Test):
    '''
    description:
        rank entries by a statistic with the test result as the primary key:
        1. order the entries by descending value of the ranking statistic
        2. within that order, move every "pass" entry in front of every "fail" entry
        3. report the original indices, the statistics and the test results in the final order
    USAGE: list_sorted_base_index_pass_fail, list_sorted_ranking_statistic_pass_fail, list_sorted_Test_pass_fail = ranking_channels(list_ranking_statistic, list_Test)

    :param list_ranking_statistic: numpy array of the ranking statistic, one value per entry
    :param list_Test: numpy array of test results {'pass', 'fail'}, one per entry
    :return:
        list_sorted_base_index_pass_fail: original indices in ranked order
        list_sorted_ranking_statistic_pass_fail: ranking statistic in ranked order
        list_sorted_Test_pass_fail: test results in ranked order
    '''
    # descending order of the statistic
    descending = np.argsort(list_ranking_statistic)[::-1]
    base_index = np.arange(0, list_Test.shape[0], 1).astype('int')

    index_desc = base_index[descending]
    statistic_desc = list_ranking_statistic[descending]
    test_desc = list_Test[descending]

    # positions of the passes followed by positions of the fails, each keeping the descending order
    pass_then_fail = np.append(np.where(test_desc == 'pass')[0],
                               np.where(test_desc == 'fail')[0])

    return index_desc[pass_then_fail], statistic_desc[pass_then_fail], test_desc[pass_then_fail]
def Plot_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        plot the result of one-sided Welch t-test:
        a bar chart of the t-value of every channel (passed channels in blue, failed ones in
        light blue) with a table of the five highest-ranked channels on top.
        The figure is saved to os.path.join(output_dir, output_file) and then closed.
    USAGE: Plot_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array (it is indexed with an index array and, in the multi-band case, its .shape is read)
    :param list_t_values_passed: a list of t-values that pass the test (zero for the channels that fail)
    :param list_t_values_failed: a list of t-values that fail the test (zero for the channels that pass)
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level; only used to label the test column of the table
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # tick positions, group boundary indices and sub-system labels
    # (NOTE(review): semantics inferred from how the three values are used below - confirm against CreateChannelTicks)
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels)
    # color the background either with black or gray for each channel group
    Listcolor = ['black', 'gray'] * 20 * 7
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
        if edgeRight == list_t_values_passed.shape[-1] - 1:  # fix the background color to match plot size
            edgeRight = edgeRight + 1
        ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
        ax.set_edgecolor(None)
    # ++++++++ bar plot of the t-values that pass or fail the Welch t-test ++++++++++++++++
    # only the non-zero entries are drawn, at their original channel positions
    plt.bar(np.where(list_t_values_failed != 0)[0], list_t_values_failed[list_t_values_failed != 0], 1, align='edge', edgecolor=None, color='lightblue',alpha=0.3, linewidth=0.)
    plt.bar(np.where(list_t_values_passed != 0)[0], list_t_values_passed[list_t_values_passed != 0], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
    # +++++++++++ plot a table of ranking channels +++++++++++++++++++
    # the two input lists are zero where the other one is set, so their sum is the full list of t-values
    list_t_values = list_t_values_passed + list_t_values_failed
    # list of the error of the probability of the causality
    # list_causal_err = list_causal_passed_err + list_causal_failed_err
    # sort channels based on the t-value
    if channels[0][-3:-1] == "BP":  # if the multi-frequency band search is chosen, the channel name is modified such that it has its name + BP#
        # make the list of frequency bands; a band whose lower edge is 'None' is labelled "All"
        list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # The list of modified channel names is structured as Channel A in band 0, Channel A in band 1, ..., Channel B in band 0, ...
        # A list of the corresponding frequency bands is created:
        # np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1) is [[band 0, band 0, ..., band 0], [band 1, band 1, ...], ...]
        # and flatten(order='F') flattens it column by column to [band 0, band 1, ..., band 0, band 1, ...]
        list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0],1), channels.shape[0], axis=1).flatten(order='F')
        # get rid of the _BP# suffix at the end of the channel name to recover the original name
        channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels])
        # rank the channels: passes first, then by descending t-value
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
        RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
        # make pandas matrix
        Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]','t-value', 'Test{}'.format(confidence_level)))
        # define the table size (five columns once the rank column is added)
        ColWidths =[0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        # rank the channels: passes first, then by descending t-value
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_t_values, list_Test)
        RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_t_values[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
        # make pandas matrix
        Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 't-value', 'Test{}'.format(confidence_level)))
        # define the table size (four columns once the rank column is added)
        ColWidths = [0.05, 0.6, 0.1, 0.15]
    # add rank label
    Prodf = Prodf.assign(rank=np.arange(1, list_t_values.shape[-1] + 1, 1))
    # round the t-values (np.array(RankChannel) stored them as strings, hence the cast)
    Prodf['t-value'] = Prodf['t-value'].astype(float).round(3)  # = df.value1.round()
    # re-sort columns to make the rank labels come first
    cols = Prodf.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    Prodf = Prodf[cols]
    # take the first 5 high ranking channels
    dcsummary = Prodf.iloc[:5]
    # plot the table on the top of the bar plot
    table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)
    # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
    # ++++++++++++++ alter ticks +++++++++++++++++
    # plot xticks using the dominant sub sensor names
    plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
    # put the tick labels at the top, drawn inside the axes
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, ListSubsys.shape[0])
    # set axis label
    plt.ylabel('t-value', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    # leave the upper 20% of the figure free for the table
    plt.tight_layout(rect=[0, 0.03, 1, 0.8])
    # save a plot
    output_path = os.path.join(output_dir, output_file)
    plt.savefig(output_path)
    plt.close()
    print('Completed Saving a plot of t-value VS channel')
    return None
def Table_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        write the ranked result of the one-sided Welch t-test to a .csv file.
        Columns are rank, Channel, Band [Hz], t-value, Test<confidence_level> in the
        multi-frequency band case and rank, label, Channel, t-value, Test<confidence_level>
        otherwise, where label is the original index of the channel.
    USAGE: Table_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array
    :param list_t_values_passed: a list of t-values that pass the test (zero for the channels that fail)
    :param list_t_values_failed: a list of t-values that fail the test (zero for the channels that pass)
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level; only used to name the test column
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # the two inputs are complementary, so their sum is the full list of t-values
    t_values = list_t_values_passed + list_t_values_failed
    test_header = 'Test{}'.format(confidence_level)

    # channel names that end in BP<digit> come from the multi-frequency band search
    is_multi_band = channels[0][-3:-1] == "BP"
    if is_multi_band:
        # one label per band; a band whose lower edge is 'None' is labelled "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # tile the labels so that they cycle band 0, band 1, ... along the channel list
        band_grid = np.repeat(band_labels.reshape(band_labels.shape[0], 1), channels.shape[0], axis=1)
        band_labels = band_grid.flatten(order='F')
        # strip the _BP# suffix to recover the original channel name
        channels = np.array([name.rsplit("_BP", 1)[0] for name in channels])

    # passes first, each group by descending t-value
    ranked, _, _ = self.ranking_channels(t_values, list_Test)

    if is_multi_band:
        rows = list(zip(channels[ranked], band_labels[ranked], t_values[ranked], list_Test[ranked]))
        header = ('Channel', 'Band [Hz]', 't-value', test_header)
    else:
        rows = list(zip(ranked, channels[ranked], t_values[ranked], list_Test[ranked]))
        header = ('label', 'Channel', 't-value', test_header)
    Prodf = pd.DataFrame(np.array(rows), columns=header)

    # rank label, rounded t-value, and the rank column moved to the front
    Prodf = Prodf.assign(rank=np.arange(1, t_values.shape[-1] + 1, 1))
    Prodf['t-value'] = Prodf['t-value'].astype(float).round(3)
    ordered_columns = Prodf.columns.tolist()
    Prodf = Prodf[ordered_columns[-1:] + ordered_columns[:-1]]

    Prodf.to_csv(os.path.join(output_dir, output_file), index=False)
    print('Completed saving a table of the t-value of channels')
    return None
################
def perform_point_chisqr_test(self, df_target, df_null, confidence_level, channels=0):
    '''
    description:
        run a single point chi-square test for each channel at each glitch and split the
        chi-square values into those that reject the null hypothesis
        (p-value < 1 - confidence_level) and those that do not
    USAGE: channels, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level = perform_point_chisqr_test(df_target, df_null, confidence_level, channels)

    :param df_target: target samples in pandas frame (needs "GPS" and "duration" columns)
    :param df_null: null samples in pandas frame
    :param confidence_level: a confidence level
    :param channels: a list of channels; an int (0 in default) means "use the channels found in df_target"
    :return
        channels: a list of channels
        list_GPS: a list of GPS times in numpy array
        list_duration: a list of durations in numpy array
        mat_chsqr_passed: chi-square values of the elements that passed the test, zero elsewhere
                          (glitch indices in rows, channels in columns)
        mat_chsqr_failed: chi-square values of the elements that failed the test, zero elsewhere
        mat_Test: a matrix of the test results {'pass', 'fail'}
        p_values: a matrix of p-values (glitch indices in rows, channels in columns)
        confidence_level: the confidence level that was used for the test
    '''
    if type(channels) is int:
        # no explicit channel list: fall back to the channels of the target samples
        channels = self.find_channels(df_target)

    list_GPS = df_target["GPS"].values
    list_duration = df_target["duration"].values

    target_values = df_target[channels].values
    null_values = df_null[channels].values
    chsqr_values, p_values = self.chisquare_test(target_values, null_values)

    n_rows = chsqr_values.shape[0]
    n_cols = chsqr_values.shape[-1]

    # elements whose p-value rejects the null hypothesis
    rejected = np.where(p_values < (1 - confidence_level))

    # rejected elements keep their chi-square value, everything else stays zero
    mat_chsqr_passed = np.zeros((n_rows, n_cols))
    mat_chsqr_passed[rejected] = chsqr_values[rejected]
    # the complement: zero for rejected elements, chi-square value otherwise
    mat_chsqr_failed = chsqr_values - mat_chsqr_passed

    # label matrix, "fail" unless the element rejected the null hypothesis
    mat_Test = np.array(["fail"] * chsqr_values.size).reshape(n_rows, n_cols)
    mat_Test[rejected] = 'pass'

    return channels, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level
def Plot_point_chisqur_test(self, channels_band, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot the result of the point chi-square test, one figure per glitch
        (bar chart of chi-square per channel with a table of the five highest-ranked
        channels on top), then save the chi-square values and the p-values as
        'chisqr.csv' and 'p_values.csv' in output_dir.
        Note: when output_file is given, every glitch is saved to that same path, so only
        the figure of the last glitch remains; leave it None to get one
        '<GPS>_chisqr.pdf' per glitch.
    USAGE: Plot_point_chisqur_test(channels_band, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_chsqr_passed: a matrix of "passed" chi-square values where glitch indices are in rows and channels are in columns;
                             elements that failed the test are zero
    :param mat_chsqr_failed: a matrix of "failed" chi-square values where glitch indices are in rows and channels are in columns;
                             elements that passed the test are zero
    :param mat_Test: a matrix of the test results {'pass', 'fail'}
    :param p_values: a matrix of p-values where glitch indices are in rows and channels are in columns
    :param confidence_level: a confidence level; only used to label the test column of the table
    :param output_dir: an output directory
    :param output_file: an output file name (optional)
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    channel_names = np.asarray(channels_band)
    # channel names that end in BP<digit> come from the multi-frequency band search
    multi_band = channel_names.shape[0] > 0 and channel_names[0][-3:-1] == "BP"
    if multi_band:
        # one label per band; a band whose lower edge is 'None' is labelled "All"
        list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the channel list is ordered Channel A band 0, Channel A band 1, ..., Channel B band 0, ...
        # so tile the band labels to cycle band 0, band 1, ... along that list
        list_freq_bands = np.repeat(list_freq_bands.reshape(list_freq_bands.shape[0], 1), channel_names.shape[0], axis=1).flatten(order='F')
        # strip the _BP# suffix to recover the original channel name
        channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channel_names])
    else:
        # `channels` used to be assigned only in the multi-band branch, so the single-band
        # table and the title below raised NameError
        channels = channel_names

    for GPS, duration, list_chsqr_passed, list_chsqr_failed, list_Test in zip(list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
        # color the background either with black or gray for each channel group
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_chsqr_passed.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)

        # ++++++++ bar plot of the chi-square values that pass or fail the test ++++++++++++++++
        plt.bar(np.where(list_chsqr_failed != 0)[0], list_chsqr_failed[list_chsqr_failed != 0], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        plt.bar(np.where(list_chsqr_passed != 0)[0], list_chsqr_passed[list_chsqr_passed != 0], 1, align='center', edgecolor=None, color='blue', linewidth=0.)

        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # the two inputs are complementary, so their sum is the full list of chi-square values
        list_chsqr = list_chsqr_passed + list_chsqr_failed
        # passes first, each group by descending chi-square
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_chsqr, list_Test)
        if multi_band:
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'chi-square', 'Test{}'.format(confidence_level)))
            # table size (five columns once the rank column is added)
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'chi-square', 'Test{}'.format(confidence_level)))
            # table size (four columns once the rank column is added)
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_chsqr.shape[-1] + 1, 1))
        # round the chi-square values (np.array(RankChannel) stored them as strings, hence the cast)
        Prodf['chi-square'] = Prodf['chi-square'].astype(float).round(3)
        # re-sort columns to make the rank labels come first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot the table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)

        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put the tick labels at the top, drawn inside the axes
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title
        plt.text(channels.shape[0]*0.5, list_chsqr.max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylabel('Chi-square', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 1])
        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_chisqr.pdf'.format(str(GPS).replace('.', '-')))
        else:
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of chi-square VS channel of a glitch at {}'.format(GPS))

    # ------------------+
    #  save the table   |
    # ------------------+
    # for chi-squares
    mat_chsqr = mat_chsqr_passed + mat_chsqr_failed
    data_GPSduration = np.hstack((list_GPS.reshape(list_GPS.shape[0], 1), list_duration.reshape(list_duration.shape[0], 1)))
    data_chisqr = np.hstack((data_GPSduration, mat_chsqr))
    columns_chisqr = np.append(np.array(["GPS", "duration"]), channels_band)
    df_chisqr = pd.DataFrame(data_chisqr, columns=columns_chisqr)
    df_chisqr.to_csv(os.path.join(output_dir, 'chisqr.csv'))
    print("chi-square table is saved")
    # for p-values
    data_pvalue = np.hstack((data_GPSduration, p_values))
    df_pvalue = pd.DataFrame(data_pvalue, columns=columns_chisqr)
    df_pvalue.to_csv(os.path.join(output_dir, 'p_values.csv'))
    print("p-value table is saved")
    return None
def perform_fap(self, df_target, df_null, confidence_level, channels=0):
    '''
    description:
        calculate a FAP for each channel at each glitch and label every value as 'pass' or 'fail'.
        A value passes when its FAP is smaller than (1 - confidence_level)
    USAGE: channels, list_GPS, list_duration, mat_fap, mat_Test, confidence_level = perform_fap(df_target, df_null, confidence_level, channels)

    :param df_target: target samples in pandas frame, must have the columns "GPS" and "duration"
    :param df_null: null samples in pandas frame
    :param confidence_level: a confidence level
    :param channels: a list of channels; an integer (0 in default) means "use the channels found in df_target"
    :return
        channels: a list of channels
        list_GPS: a list of GPS times in numpy array
        list_duration: a list of durations in numpy array
        mat_fap: matrix of fap returned by calculate_fap
        mat_Test: a matrix of the test results {'pass', 'fail'}
        confidence_level: the user defined confidence level, returned unchanged
    '''
    # an integer stands for "no channel list given": look the channels up in the target samples
    if type(channels) is int:
        channels = self.find_channels(df_target)

    # GPS times and durations of the target samples
    list_GPS = df_target["GPS"].values
    list_duration = df_target["duration"].values

    # FAP of the target samples against the null samples
    target_values = df_target[channels].values
    null_values = df_null[channels].values
    mat_fap = self.calculate_fap(target_values, null_values)

    # start from an all-"fail" matrix and flip the entries that reject the null hypothesis
    n_rows, n_columns = mat_fap.shape[0], mat_fap.shape[-1]
    mat_Test = np.array(["fail"] * mat_fap.size).reshape(n_rows, n_columns)
    rejection_threshold = 1 - confidence_level
    mat_Test[np.where(mat_fap < rejection_threshold)] = 'pass'

    return channels, list_GPS, list_duration, mat_fap, mat_Test, confidence_level
def Plot_fap(self, channels_band, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot 1/FAP versus channel for each glitch and save one figure per glitch.
        Channels whose FAP is smaller than (1 - confidence_level) are drawn in blue, the others in light blue,
        and a table with the first five channels in the order given by ranking_channels is put above the bars.
        A FAP equal to 0 is replaced by the smallest non-zero FAP in mat_fap so that 1/FAP stays finite.
        The replacement is used for the bars, the ranking and the table, and it is done on a copy:
        mat_fap itself is left untouched
    USAGE: Plot_fap(channels_band, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array. When the first name ends with "BP" + one character,
                          the names are treated as multi-frequency band names
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_fap: matrix of fap, one row per glitch
    :param mat_Test: a matrix of the test results {'pass', 'fail'}, one row per glitch
    :param confidence_level: a confidence level
    :param output_dir: an output directory
    :param output_file: an output file name. If None, "<GPS>_fap.pdf" is used for each glitch;
                        otherwise every glitch is saved under this same name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # a FAP below this value rejects the null hypothesis
    fap_threshold = 1 - confidence_level
    # background colors alternate between the two
    background_colors = ('black', 'gray')

    for GPS, duration, list_fap, list_Test in zip(list_GPS, list_duration, mat_fap, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)

        # color the background either with black or gray for each span between consecutive entries of ListInd
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
            if edgeRight == list_fap.shape[-1] - 1:
                # fix the background color to match plot size
                edgeRight = edgeRight + 1
            # modulo instead of a fixed-length color list, so any number of spans is supported
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=background_colors[color_ind % 2], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)

        # ++++++++ plot bar plot about 1 / fap ++++++++++++++++
        # list_fap is a row of mat_fap, so work on a copy to keep the caller's matrix intact
        list_fap_for_plot = list_fap.copy()
        is_zero = list_fap_for_plot == 0
        if is_zero.any():
            # 1 / fap would be infinite for fap = 0, so use the smallest non-zero fap in mat_fap instead
            list_fap_for_plot[is_zero] = mat_fap[mat_fap != 0].min()

        is_failed = list_fap_for_plot >= fap_threshold
        plt.bar(np.where(is_failed)[0], 1 / list_fap_for_plot[is_failed], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        is_passed = list_fap_for_plot < fap_threshold
        plt.bar(np.where(is_passed)[0], 1 / list_fap_for_plot[is_passed], 1, align='center', edgecolor=None, color='blue', linewidth=0.)

        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # sort the channels based on -log10(fap)
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(-np.log10(list_fap_for_plot), list_Test)

        if channels_band[0][-3:-1] == "BP":
            # the multi-frequency band search is chosen: the channel name is its original name + BP#
            # label of each frequency band; a band without limits ('None') is labelled "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # the names are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
            # so the band labels are repeated as [band 0, band 1, ..., band 0, band 1, ...]
            list_freq_bands = np.tile(list_freq_bands, channels_band.shape[0])
            # get rid of the BP# at the end of the channel name to get the original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_fap_for_plot[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'FAP', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            # no band suffix: the names are used as they are
            # (defining it here also keeps the title below from failing on an undefined name)
            channels = channels_band
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_fap_for_plot[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'FAP', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]

        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_fap.shape[-1] + 1, 1))
        # show the FAP in scientific notation
        Prodf['FAP'] = ['{:.2e}'.format(fap) for fap in Prodf['FAP'].astype(float)]
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)

        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title; the y axis is logarithmic, so the height is a power of the maximum instead of a multiple
        plt.text(channels.shape[0]*0.5, (1/list_fap_for_plot).max()**1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylim(ymin=1)
        plt.ylabel('1/FAP', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.yscale('log')
        plt.tight_layout(rect=[0, 0.03, 1, 1])

        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_fap.pdf'.format(str(GPS).replace('.', '-')))
        else:
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of FAP VS channel of a glitch at {}'.format(GPS))
    return None
def Save_fap_csv(self, channels, list_GPS, list_duration, mat_fap, output_dir):
    '''
    description:
        save the FAP table as "fap.csv" in the output directory.
        The first two columns are the GPS time and the duration, followed by one column per channel
    USAGE: Save_fap_csv(channels, list_GPS, list_duration, mat_fap, output_dir)

    :param channels: a list of channels, used as the column names of mat_fap
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_fap: a matrix of FAP of each channel for each glitch
    :param output_dir: output directory
    :return: None
    '''
    # turn the GPS times and the durations into two column vectors
    gps_column = list_GPS.reshape(len(list_GPS), 1)
    duration_column = list_duration.reshape(len(list_duration), 1)
    leading_columns = np.concatenate((gps_column, duration_column), axis=1)

    # put the FAP matrix to the right of them
    table_values = np.concatenate((leading_columns, mat_fap), axis=1)
    table_header = np.concatenate((np.array(["GPS", "duration"]), np.ravel(channels)))

    # save the table
    csv_path = os.path.join(output_dir, 'fap.csv')
    pd.DataFrame(table_values, columns=table_header).to_csv(csv_path)
    print("FAP table is saved")
    return None
def Plot_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        plot p_greater versus channel and save the figure.
        Non-zero entries of list_p_greater_passed are drawn in red, non-zero entries of list_p_greater_failed in light blue,
        and a table with the first five channels in the order given by ranking_channels is put above the bars
    USAGE: Plot_p_greater(channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array. When the first name ends with "BP" + one character,
                     the names are treated as multi-frequency band names
    :param list_p_greater_passed: a list of p_greater that pass the test
    :param list_p_greater_failed: a list of p_greater that fail the test
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the header of the table
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels)

    # alternate the background between black and gray for each span between consecutive entries of ListInd
    shades = ['black', 'gray'] * 140
    last_position = list_p_greater_passed.shape[-1] - 1
    for shade_index, (left, right) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        # widen the last span by one so that the background matches the plot size
        right_edge = right + 1 if right == last_position else right
        span = plt.axvspan(left, right_edge, facecolor=shades[shade_index], alpha=0.5, edgecolor=None)
        span.set_edgecolor(None)

    # ++++++++ bars of the values that fail, then of the values that pass ++++++++++++++++
    failed_mask = list_p_greater_failed != 0
    plt.bar(np.where(failed_mask)[0], list_p_greater_failed[failed_mask], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
    passed_mask = list_p_greater_passed != 0
    plt.bar(np.where(passed_mask)[0], list_p_greater_passed[passed_mask], 1, align='edge', edgecolor=None, color='red', linewidth=0.)

    # +++++++++++ table of ranking channels +++++++++++++++++++
    # each channel has a value in only one of the two lists, so the sum holds every p_greater
    p_greater_values = list_p_greater_passed + list_p_greater_failed
    # header of the p_greater column
    name_p_greater = r'$p_g$'
    test_header = 'Threshold {}'.format(confidence_level)
    # order of the channels in the table
    ranked, _, _ = self.ranking_channels(p_greater_values, list_Test)

    if channels[0][-3:-1] == "BP":
        # multi-frequency band search: a channel name is its original name + BP#
        # a band without limits ('None') is labelled "All"
        band_labels = np.array(["All" if lower == 'None' else "{}-{}".format(lower, upper) for lower, upper in freq_bands])
        # names are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
        # so the labels are repeated as [band 0, band 1, ..., band 0, band 1, ...]
        band_labels = np.tile(band_labels, channels.shape[0])
        # strip the BP# suffix to recover the original channel names
        base_names = np.array([name.rsplit("_BP", 1)[0] for name in channels])
        rows = list(zip(base_names[ranked], band_labels[ranked], p_greater_values[ranked], list_Test[ranked]))
        Prodf = pd.DataFrame(np.array(rows), columns=('Channel', 'Band [Hz]', name_p_greater, test_header))
        ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        rows = list(zip(channels[ranked], p_greater_values[ranked], list_Test[ranked]))
        Prodf = pd.DataFrame(np.array(rows), columns=('Channel', name_p_greater, test_header))
        ColWidths = [0.05, 0.6, 0.1, 0.15]

    # rank labels 1, 2, ... in the table order
    Prodf = Prodf.assign(rank=np.arange(1, p_greater_values.shape[-1] + 1, 1))
    # round p_greater to 5 digits
    Prodf[name_p_greater] = Prodf[name_p_greater].astype(float).round(5)
    # bring the rank column to the front
    column_order = Prodf.columns.tolist()
    Prodf = Prodf[column_order[-1:] + column_order[:-1]]
    # only the first 5 rows are shown
    top_rows = Prodf.iloc[:5]
    table = plt.table(cellText=top_rows.values, colLabels=top_rows.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)

    # ++++++ ticks and labels ++++++++++++++++++++++++++++++++++++++
    # xticks show the sub sensor names, placed inside the top of the plot
    plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, ListSubsys.shape[0])
    plt.ylabel(r'$p_g$', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    plt.tight_layout(rect=[0, 0.03, 1, 1])

    # save a plot
    plt.savefig(os.path.join(output_dir, output_file))
    plt.close()
    print('Completed Saving a plot of p_greater VS channel')
    return None
def Table_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        save the result of p_greater as a .csv file, one row per channel in the order given by ranking_channels.
        For the multi-frequency band search the columns are rank, Channel, Band [Hz], p_greater and the test result;
        otherwise they are rank, label (the index of the channel in the input list), Channel, p_greater and the test result
    USAGE: Table_p_greater(channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array. When the first name ends with "BP" + one character,
                     the names are treated as multi-frequency band names
    :param list_p_greater_passed: a list of p_greater that pass the test
    :param list_p_greater_failed: a list of p_greater that fail the test
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the header of the test column
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # each channel has a value in only one of the two lists, so the sum holds every p_greater
    p_greater_values = list_p_greater_passed + list_p_greater_failed
    test_header = 'Test{}'.format(confidence_level)
    # order of the channels in the table
    ranked, _, _ = self.ranking_channels(p_greater_values, list_Test)

    if channels[0][-3:-1] == "BP":
        # multi-frequency band search: a channel name is its original name + BP#
        # a band without limits ('None') is labelled "All"
        band_labels = np.array(["All" if lower == 'None' else "{}-{}".format(lower, upper) for lower, upper in freq_bands])
        # names are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
        # so the labels are repeated as [band 0, band 1, ..., band 0, band 1, ...]
        band_labels = np.tile(band_labels, channels.shape[0])
        # strip the BP# suffix to recover the original channel names
        base_names = np.array([name.rsplit("_BP", 1)[0] for name in channels])
        rows = list(zip(base_names[ranked], band_labels[ranked], p_greater_values[ranked], list_Test[ranked]))
        header = ('Channel', 'Band [Hz]', 'p_greater', test_header)
    else:
        # keep the original index of each channel in the "label" column
        rows = list(zip(ranked, channels[ranked], p_greater_values[ranked], list_Test[ranked]))
        header = ('label', 'Channel', 'p_greater', test_header)
    Prodf = pd.DataFrame(np.array(rows), columns=header)

    # rank labels 1, 2, ... in the table order
    Prodf = Prodf.assign(rank=np.arange(1, p_greater_values.shape[-1] + 1, 1))
    # round p_greater to 6 digits
    Prodf['p_greater'] = Prodf['p_greater'].astype(float).round(6)
    # bring the rank column to the front
    column_order = Prodf.columns.tolist()
    Prodf = Prodf[column_order[-1:] + column_order[:-1]]

    # save it
    Prodf.to_csv(os.path.join(output_dir, output_file), index=False)
    print('Completed saving a table of the p_greater of channels')
    return None
def Save_p_belong_csv(self, channels_passed, list_GPS, list_duration, mat_p_belong, output_dir):
    '''
    description:
        save the p_belong table as "p_belong.csv" in the output directory.
        The first two columns are the GPS time and the duration, followed by one column per channel.
        channels_passed is the list of channels whose p_greater is above 0.5 to avoid misinterpretation of p_belong
    USAGE: Save_p_belong_csv(channels_passed, list_GPS, list_duration, mat_p_belong, output_dir)

    :param channels_passed: a list of channels, used as the column names of mat_p_belong
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_p_belong: a matrix of p_belong of each channel for each glitch
    :param output_dir: output directory
    :return: None
    '''
    # turn the GPS times and the durations into two column vectors
    gps_column = list_GPS.reshape(len(list_GPS), 1)
    duration_column = list_duration.reshape(len(list_duration), 1)
    leading_columns = np.concatenate((gps_column, duration_column), axis=1)

    # put the p_belong matrix to the right of them
    table_values = np.concatenate((leading_columns, mat_p_belong), axis=1)
    table_header = np.concatenate((np.array(["GPS", "duration"]), np.ravel(channels_passed)))

    # save the table
    csv_path = os.path.join(output_dir, 'p_belong.csv')
    pd.DataFrame(table_values, columns=table_header).to_csv(csv_path)
    print("p_belong table is saved")
    return None
def Plot_p_belong(self, channels_band, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot p_belong versus channel for each glitch and save one figure per glitch.
        Channels whose p_belong is greater than confidence_level are drawn in blue, the others in light blue,
        and a table with the first five channels in the order given by ranking_channels is put above the bars
    USAGE: Plot_p_belong(channels_band, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array. When the first name ends with "BP" + one character,
                          the names are treated as multi-frequency band names and the text after "BP" is read as the band index
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_p_belong: matrix of p_belong, one row per glitch
    :param mat_Test: a matrix of the test results {'pass', 'fail'}, one row per glitch
    :param confidence_level: a confidence level
    :param output_dir: an output directory
    :param output_file: an output file name. If None, "<GPS>_p_belong.pdf" is used for each glitch;
                        otherwise every glitch is saved under this same name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # background colors alternate between the two
    background_colors = ('black', 'gray')

    for GPS, duration, list_p_belong, list_Test in zip(list_GPS, list_duration, mat_p_belong, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)

        # color the background either with black or gray for each span between consecutive entries of ListInd
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
            if edgeRight == list_p_belong.shape[-1] - 1:
                # fix the background color to match plot size
                edgeRight = edgeRight + 1
            # modulo instead of a fixed-length color list, so any number of spans is supported
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=background_colors[color_ind % 2], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)

        # ++++++++ plot bar plot about p_belong ++++++++++++++++
        is_below = list_p_belong <= confidence_level
        plt.bar(np.where(is_below)[0], list_p_belong[is_below], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        is_above = list_p_belong > confidence_level
        plt.bar(np.where(is_above)[0], list_p_belong[is_above], 1, align='center', edgecolor=None, color='blue', linewidth=0.)

        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # sort the channels based on p_belong
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_p_belong, list_Test)

        if channels_band[0][-3:-1] == "BP":
            # the multi-frequency band search is chosen: the channel name is its original name + BP#
            # label of each frequency band; a band without limits ('None') is labelled "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # the band of each channel is looked up from the index after "BP" in its name
            # NOTE(review): presumably because channels_band only holds the channels that passed p_greater,
            # so it is not a full channel x band grid -- confirm against the caller
            list_freq_bands = np.array([list_freq_bands[int(ch.split('BP')[-1])] for ch in channels_band])
            # get rid of the BP# at the end of the channel name to get the original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'p_belong', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            # no band suffix: the names are used as they are
            # (defining it here keeps the title below from failing on an undefined name)
            channels = channels_band
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            # make pandas matrix
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'p_belong', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]

        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_p_belong.shape[-1] + 1, 1))
        # show p_belong with 4 decimals
        Prodf['p_belong'] = ['{:.4f}'.format(p_belong) for p_belong in Prodf['p_belong'].astype(float)]
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)

        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title
        plt.text(channels.shape[0]*0.5, list_p_belong.max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylabel('p_belong', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 1])

        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_p_belong.pdf'.format(str(GPS).replace('.', '-')))
        else:
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of p_belong VS channel of a glitch at {}'.format(GPS))
    return None
################### # def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence): # ''' # description: # 1. query the clustered target samples # 2. query null samples # 3. perform one-sided binomial test and one-sided Welch t-test on each subclass # 4. count the number of subclasses which has at least one channel passing the both tests # USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence) # # :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples # :param Path_Null_Dataset: a path to null samples # :param test_confidence: a statistical confidence level # :return: num_subclass: the number of subclasses which has at least one channel passing the both tests # ''' # num_subclass = 0 # set it zero # # query the clustered target samples and null samples # df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset) # # get a list of subclass labels (non-duplicated) # SubClass_labels = df_target.SubclassLabel.unique().astype('int') # for Sub_class_Label in SubClass_labels: # iterate though subclasses # df_target_sub = df_target[df_target.SubclassLabel == Sub_class_Label].drop(['SubclassLabel'], axis=1) # # binomial test # list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_binomi_Test, list_channels = self.find_meaing_ful_confidence(df_target_sub, df_null, test_confidence) # # Welch t-test # channels, list_t_values_passed, list_t_values_failed, list_t_Test, confidence_level_used = self.perform_Welch_test(df_target_sub, df_null, test_confidence) # if list_binomi_Test[list_binomi_Test == 'pass'].shape[0] > 0 and list_t_Test[list_t_Test == 'pass'].shape[0] > 0: # if there is a least one channel that pass the both tests # num_subclass += 1 # count 1 # return num_subclass
def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence):
    '''
    description:
        1. query the clustered target samples and the null samples
        2. estimate the beta distributions with perform_beta_dist
        3. run perform_p_greater on each subclass
        4. count the subclasses in which at least one channel is marked 'pass'
    USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence)

    :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples
    :param Path_Null_Dataset: a path to null samples
    :param test_confidence: a statistical confidence level
    :return: num_subclass: the number of subclasses which have at least one passing channel
    '''
    # load both sample sets and fit the distributions once for all the subclasses
    df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset)
    rv_t_dict, rv_n_dict = self.perform_beta_dist(df_target, df_null)

    # non-duplicated subclass labels
    subclass_ids = df_target.SubclassLabel.unique().astype('int')

    num_subclass = 0
    for subclass_id in subclass_ids:
        # samples of this subclass only, without the label column
        members = df_target[df_target.SubclassLabel == subclass_id].drop(['SubclassLabel'], axis=1)
        # only the per-channel pass/fail results are needed from the seven returned values
        _, _, _, _, _, verdicts, _ = self.perform_p_greater(members, rv_t_dict, rv_n_dict, test_confidence)
        passed = verdicts[verdicts == 'pass']
        num_subclass += 1 if passed.shape[0] > 0 else 0
    return num_subclass
def Get_MatCountGPSDuration(self, Path_Target_Glitch_SubClassClustered_Dataset):
    '''
    description:
        get an importance matrix, a list of channels, a list of GPS times, and a list of durations from the clustered target samples
    USAGE: MatCount, ListChannelName, ListGPS, ListDuration = Get_MatCountGPSDuration(Path_Target_Glitch_SubClassClustered_Dataset)

    :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples (.csv)
    :return:
        MatCount: an importance matrix (rows follow the file, columns follow ListChannelName)
        ListChannelName: a list of channels
        ListGPS: a list of GPS times
        ListDuration: a list of durations
    :raises KeyError: if the file has no 'GPS' or 'duration' column
    '''
    df = pd.read_csv(Path_Target_Glitch_SubClassClustered_Dataset)
    # the two mandatory columns are read first so that a missing one fails with a clear KeyError
    ListGPS = df['GPS'].values
    ListDuration = df['duration'].values
    # every column that is not bookkeeping is a channel.  errors='ignore' keeps this working
    # for files written without the index column ('Unnamed: 0')
    df_channels = df.drop(['Unnamed: 0', 'SubclassLabel', 'GPS', 'duration'], axis=1, errors='ignore')
    ListChannelName = np.array(df_channels.columns.tolist())
    MatCount = df_channels.values
    return MatCount, ListChannelName, ListGPS, ListDuration
def Principal_component_analysis(self, MatCount, frac_component=0.9):
    '''
    description:
        PCA decomposition applied to a feature matrix, keeping the smallest number of leading
        components whose cumulative explained variance ratio exceeds frac_component
    USAGE: MatCount_pca, MatCount_inverse_pca = Principal_component_analysis(MatCount, frac_component=0.9)

    :param MatCount: a feature matrix of glitches with samples in rows and features in columns
    :param frac_component: the cumulative explained variance ratio to be exceeded
    :return:
        MatCount_pca: a feature matrix in PCA space
        MatCount_inverse_pca: a reconstructed feature matrix in the original space
    '''
    # first pass with all the components, only to learn the variance spectrum
    full_pca = PCA()
    full_pca.fit(MatCount)
    cum_variance = np.cumsum(full_pca.explained_variance_ratio_)

    # cum_variance is non-decreasing, so the number of entries that do not exceed the threshold
    # is the 0-based index of the first component that does; the component count is that index + 1
    index_cut = int((cum_variance <= frac_component).sum())
    # if rounding keeps the total below the threshold (e.g. frac_component=1), keep everything
    n_keep = min(index_cut + 1, cum_variance.shape[0])

    # second pass restricted to the retained components
    pca = PCA(n_components=n_keep)
    MatCount_pca = pca.fit_transform(MatCount)
    MatCount_inverse_pca = pca.inverse_transform(MatCount_pca)
    return MatCount_pca, MatCount_inverse_pca
def FindSubClass(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, upper_number_cluster, applied_Transformation):
    '''
    description:
        1. cluster the glitches with Gaussian mixture models, keeping the model with the lowest BIC
        2. plot glitch index VS channel grouped by clusters
        3. save a table comprising sub-class label, GPS time, duration and importance of all the channels
        4. plot Importance VS channel of each sub-class
        5. make a corresponding table of each sub-class
    USAGE: FindSubClass(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, upper_number_cluster, applied_Transformation)

    :param MatCount: a matrix comprising importance versus channels (glitches in rows)
    :param ListChannelName: a list of channel names
    :param ListGPS: a list of GPS times in numpy array
    :param ListDuration: a list of durations in numpy array
    :param output_dir: an output directory
    :param upper_number_cluster: a value of upper limit of the number of clusters
    :param applied_Transformation: decomposition applied before clustering {'PCA', 'None'}
    :return: None
    :raises ValueError: if applied_Transformation is not supported
    '''
    # clustering needs at least two glitches.  A 1-D array is a single glitch; a square
    # matrix (as many glitches as channels) is a valid input and must not be skipped
    if MatCount.ndim < 2 or MatCount.shape[0] < 2:
        print('There is only one single glitch. Skip finding sub-classes')
        return None
    if applied_Transformation not in ('PCA', 'None', None):
        raise ValueError("applied_Transformation must be 'PCA' or 'None', got {!r}".format(applied_Transformation))
    # the number of clusters can not exceed the number of samples
    if MatCount.shape[0] < upper_number_cluster:
        upper_number_cluster = MatCount.shape[0]
    print('Finding sub-classes by Gaussian mixture clustering')

    #-------------------------------------------------+
    # use Gaussian mixture model to get sub-classes   |
    #-------------------------------------------------+
    if applied_Transformation == 'PCA':
        X, _ = self.Principal_component_analysis(MatCount)
    else:
        X = MatCount

    # scan covariance types and numbers of components, keep the fit with the lowest BIC
    lowest_bic = np.inf
    best_gmm = None
    for cv_type in ('spherical', 'tied', 'diag', 'full'):
        for n_components in range(1, upper_number_cluster + 1):
            gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            current_bic = gmm.bic(X)
            if current_bic < lowest_bic:
                lowest_bic = current_bic
                best_gmm = gmm
    if best_gmm is None:
        raise ValueError('No Gaussian mixture model could be selected (upper_number_cluster = {})'.format(upper_number_cluster))

    Y_ = best_gmm.predict(X)  # label of the clusters
    # (label, matrix of the member glitches) in increasing numerical order of the label
    ListSubClass = [(int(label), MatCount[np.where(Y_ == label)[0]]) for label in np.unique(Y_)]
    print('{} sub-classes have been found'.format(len(ListSubClass)))

    #----------------------------------------------------------+
    # plot of occurrence VS channel grouped by sub-classes     |
    #----------------------------------------------------------+
    # color the background either with black or gray for each channel group
    Listcolor = ['black', 'gray'] * 20 * 7
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    fig = plt.figure()
    # the channel indicator on the top takes 10 % of the glitch rows, but at least one row
    firstratio = max(int(0.1 * MatCount.shape[0]), 1)
    totalrow = MatCount.shape[0] + firstratio
    # grid for subplots; the last column is kept for the color bar
    gs = GridSpec(totalrow, 20)

    # a subplot showing channel position
    ax0 = fig.add_subplot(gs[:firstratio, :-1])
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == MatCount.shape[-1] - 1:
            # adjust the rightest color span
            edgeRight = edgeRight + 1
        ax0.axvspan(edgeLeft - 0.5, edgeRight - 0.5, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
    # same x range as the image plots below
    ax0.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001) - 1 + 0.5)
    ax0.yaxis.set_major_locator(plt.NullLocator())
    ax0.xaxis.set_major_locator(plt.NullLocator())
    ax0.set_xticks(CenterTicks)
    ax0.set_xticklabels(ListSubsys[ListInd.astype(int)])
    ax0.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-20, length=0, labelrotation=90, size=0, labelsize=10)
    ax0.grid(False)

    # one image per sub-class, stacked vertically with a height proportional to its size
    vmin = MatCount.min()
    vmax = MatCount.max()
    StartRow = firstratio
    for SubClassLabel, SubMatCount in ListSubClass:
        EndRow = StartRow + SubMatCount.shape[0]
        ax = fig.add_subplot(gs[StartRow:EndRow, :-1])
        ax.imshow(SubMatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper', vmin=vmin, vmax=vmax)
        ax.yaxis.set_label_position("right")
        ax.tick_params(axis="y", labelsize=10)
        ax.axis(xmin=-0.5, xmax=round(SubMatCount.shape[-1] * 1.001) - 1 + 0.5)
        # sub-class name on the right edge
        ax.text(SubMatCount.shape[-1] * 1.001, SubMatCount.shape[0] / 2., 'sub{}'.format(SubClassLabel), ha='left', va='bottom', rotation=90., fontsize=10)
        if SubMatCount.shape[0] == 1:
            ax.set_yticks([0])
        else:
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.set_xticks([])  # delete ticks
        ax.grid(False)
        StartRow = EndRow  # update for a next one
    fig.subplots_adjust(hspace=int(0.005 * totalrow))
    fig.subplots_adjust(wspace=int(3))
    # the x label belongs to the bottom (last) image
    ax.set_xlabel('Auxiliary channel', fontsize=15)
    fig.text(0.05, 0.5, 'Glitch index', ha='center', va='center', rotation=90., fontsize=15)

    # color bar in the last grid column
    norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=matplotlib.cm.viridis, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)

    output_path = os.path.join(output_dir, 'GlitchIndexVSChannelClustered.pdf')
    fig.savefig(output_path)
    plt.close()

    #-------------------------------------------------------------------------------+
    # make .csv file comprising SubClassLabel, GPS, importance of all the channels  |
    #-------------------------------------------------------------------------------+
    # columns: sub-class label, GPS time, duration, then the importance of every channel
    MatSubClassLabelGPSDuration = np.concatenate((Y_.reshape(Y_.shape[0], 1), ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1)), axis=1)
    MatSubClassLabelGPSDurationImportanceChannels = np.concatenate((MatSubClassLabelGPSDuration, MatCount), axis=1)
    columnName = np.append(np.array(['SubclassLabel', 'GPS', 'duration']), ListChannelName)
    dfGPSDurationImportanceChannels = pd.DataFrame(MatSubClassLabelGPSDurationImportanceChannels, columns=columnName)
    path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'ClusteredGPSDurationImportanceChannels.csv')
    dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
    print('Completed saving a table comprising a sub-class labels, GPS times, and importance of the channels')

    #--------------------------------------------+
    # plot each sub-class and make a table of it |
    #--------------------------------------------+
    for SubClassLabel, SubMatCount in ListSubClass:
        Plotoutput_file = 'SubClass{}.pdf'.format(SubClassLabel)
        Tableoutput_file = 'SubClass{}.csv'.format(SubClassLabel)
        self.PlotImportanceVSChannel(SubMatCount, ListChannelName, output_dir, Plotoutput_file)
        self.TableImportance(SubMatCount, ListChannelName, output_dir, Tableoutput_file)
        print('Completed plotting and making a table of sub-class {}'.format(SubClassLabel))
    print('Sub-class analysis is done')
    return None
def AutoDetermineTrendBin(self, SegmentStart, SegmentEnd):
    '''
    description:
        pick the time unit of the subclass trend plot from the length of the segment
    USAGE: trend = AutoDetermineTrendBin(SegmentStart, SegmentEnd)

    :param SegmentStart: start time of a segment
    :param SegmentEnd: end time of a segment
    :return: trend: {'mins', 'hours', 'days', 'months'}
    '''
    hour = 60 * 60
    day = 24 * hour
    month = 30 * day
    span = SegmentEnd - SegmentStart  # segment length in second

    # (unit, exclusive upper limit of the segment length for which the unit is used)
    ladder = (('mins', 3 * hour), ('hours', 3 * day), ('days', 3 * month))
    for unit, upper_limit in ladder:
        if span < upper_limit:
            trend = unit
            break
    else:
        # three months or longer
        if span >= 3 * month:
            trend = 'months'
    return trend
def TrendSubClass(self, SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='months', norm=False):
    '''
    description:
        plot the rate of each sub-class versus time over a segment
        the input file is supposed to be the clustered table with 'SubclassLabel' and 'GPS' columns
    USAGE: TrendSubClass(SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='months')

    :param SegmentStart: start GPS time of a segment
    :param SegmentEnd: end GPS time of a segment
    :param input_dir: input directory
    :param input_file: an input file
    :param output_dir: an output directory
    :param trend: a trend 'months', 'days', 'hours' or 'mins'
    :param norm: if true, the histograms are made with density=True; False in default
    :return: None
    :raises ValueError: if trend is not one of the supported values
    '''
    # load an input file
    input_path = os.path.join(input_dir, input_file)
    try:
        Subclass_df = pd.read_csv(input_path)
    except IOError:
        print('There is no sub-class file. Skip the trend plot')
        return None

    # trend -> (length of one unit in second, name of the output file)
    trend_table = {
        'months': (float(30 * 24 * 60 * 60), 'SubClassTrend_Monthly.pdf'),
        'days': (float(24 * 60 * 60), 'SubClassTrend_Dayly.pdf'),
        'hours': (float(60 * 60), 'SubClassTrend_Hourly.pdf'),
        'mins': (float(60), 'SubClassTrend_minutes.pdf'),
    }
    if trend not in trend_table:
        raise ValueError("trend must be 'months', 'days', 'hours' or 'mins', got {!r}".format(trend))
    TrendInSecond, output_file = trend_table[trend]

    # time from the beginning of the segment in a trend scale
    Subclass_df = Subclass_df.assign(time=(Subclass_df['GPS'] - SegmentStart) / TrendInSecond)
    # a length of this segment in a trend scale
    SegmentLength = (SegmentEnd - SegmentStart) / TrendInSecond
    # one bin per trend unit; np.histogram needs at least one bin even for a short segment
    Bins = max(1, int(round(SegmentLength)))

    # non-duplicate sub-class labels and the time series of each of them
    UniqueClass = np.unique(Subclass_df['SubclassLabel']).astype(int)
    SubClassSeries = [Subclass_df[Subclass_df['SubclassLabel'] == subclass]['time'] for subclass in UniqueClass]
    LABEL = ['sub{}'.format(i) for i in UniqueClass]

    # randomly ordered colors; solid lines are used first and the other line styles
    # only when there are more sub-classes than colors
    ListColor = ['blue', 'red', 'cyan', 'gray', 'green', 'yellow']
    ListLineStyle = ['-', '--', ':', '-.']
    rd.shuffle(ListColor)
    ColorStyleCombination = [(color, style) for style in ListLineStyle for color in ListColor]

    # make a histogram of each sub-class
    density = bool(norm)
    for index, (series, label) in enumerate(zip(SubClassSeries, LABEL)):
        Count, edge = np.histogram(series, range=(0, SegmentLength), bins=Bins, density=density)
        d_edge = edge[1] - edge[0]
        Rate = Count / float(d_edge)
        color, style = ColorStyleCombination[index % len(ColorStyleCombination)]
        plt.plot(edge[:-1], Rate, marker='o', color=color, ls=style, alpha=0.5, label=label)
    if density:
        plt.ylabel('Normalized rate [/{}]'.format(trend), fontsize=15)
    else:
        # singular unit name, e.g. 'months' -> 'month'
        plt.ylabel('Rate [/{}]'.format(trend.split('s')[0]), fontsize=15)

    TIME = tconvert(SegmentStart)
    plt.xlabel('Time [{0}] from {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(trend, TIME.year, TIME.month, TIME.day, TIME.hour, TIME.minute, TIME.second), fontsize=15)
    plt.tick_params(axis="x", labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.legend()

    output_path = os.path.join(output_dir, output_file)
    plt.savefig(output_path)
    plt.close()
    return None
def make_subset_channel_based_on_samplingrate(self, g, target_sampling_rate):
    '''
    description:
        collect the target ('T') data sets of a group that have a given sampling rate
    dependencies being upon: PlotIndividualFCS_ImportanceVSChannel()
    USAGE: X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = make_subset_channel_based_on_samplingrate(f['gps00000'], 256)

    :param g: target glitch class's group or a file itself (at a GPS time), HDF5 format
    :param target_sampling_rate: sampling rate used to group them, (256, 512, 1024, 2048, 4096, 8192, 16384)
    :return:
        X: matrix with one row per selected channel, empty array if no channel is selected
        duration: duration attribute of the last target data set that was read, None if there is none
        Listch_label_num: the list of channel labels (the number at the end of the data set name)
        ListChannel: the list of name of channels
        GPS: GPS time of this group
        SNR: SNR of this glitch, 0 if the group has no glitch meta data
        confidence: confidence level of this glitch, 0 if the group has no glitch meta data
        ID: GravitySpy unique ID, 'None' if the group has no glitch meta data
    '''
    def _to_text(value):
        # string attributes come back either as bytes or as str depending on how they
        # were stored and on the h5py version; anything else is treated as missing
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, str):
            return value
        return None

    GPS = g.attrs['GPS']  # GPS time
    try:
        SNR = g.attrs['snr']
        confidence = g.attrs['confidence']
        raw_id = g.attrs['id']
    except KeyError:
        # a group without glitch meta data
        SNR = 0
        confidence = 0
        ID = 'None'
    else:
        ID = _to_text(raw_id)  # GravitySpy id

    Listch_label_num = []  # a list having the channel labels
    ListChannel = []  # a list having the name of channels
    rows = []  # one (1, n) array per selected channel
    duration = None
    typeSeg = 'T'
    label_prefix = 'ch{}'.format(typeSeg)
    for ch_label in g.keys():
        if str(ch_label)[2] != typeSeg:
            continue  # only target data sets
        dset = g[ch_label]
        sr = dset.attrs['SamplingRate']
        duration = dset.attrs['duration']
        channel = _to_text(dset.attrs['channel'])
        # remove noisemon channels; names with too few fields can not be one
        try:
            is_noisemon = channel.split('-')[1].split('_')[2] == 'NOISEMON'
        except IndexError:
            is_noisemon = False
        if is_noisemon or sr != target_sampling_rate:
            continue
        spT = dset[()]  # whitened FFT values (Dataset.value no longer exists in h5py 3)
        if np.isnan(spT[0]):
            # a series starting with NaN is replaced with zeros
            spT = np.zeros(spT.shape[0])
        rows.append(spT.reshape((1, spT.shape[0])))
        Listch_label_num.append(int(ch_label.split(label_prefix)[1]))
        ListChannel.append(channel)
    # stack once at the end instead of growing the matrix row by row
    X = np.vstack(rows) if rows else np.array([])
    return X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID
[docs] def PlotIndividualFCS_ImportanceVSChannel(self, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None', mode='offline', Listsegments=None, re_sfchs=None, Data_outputpath=None, Data_outputfilename=None, PlusHOFT='False', number_process=None): ''' description: 1. load a file comprising all glitches in a class 2. create a plot comprising frequency versus channel & importance versus channel - dependency: self.CreateChannelTicks(ListChannel) self.make_subset_channel_based_on_samplingrate() self.CreateMatCount() self.PlotImportanceVSChannel() 3. save a plot dependencies: make_subset_channel_based_on_samplingrate(), CreateChannelTicks(), CreateMatCount(), PlotImportanceVSChannel() USAGE: PlotIndividualFCS_ImportanceVSChannel(glitchtype, IFO, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq) :param glitchtype: a type of glitch used for create a name of a plot :param IFO: # a type of IFO used in a name of a plot :param GravitySpy_df: a meta data of Gravity Spy in pandas frame :param output_dir: a output directory :param simga: an integer number used for the upper bound of BG noise :param LowerCutOffFreq: the lower cut-off frequency used in CreateMatCount, 'None' in default :param UpperCutOffFreq: the upper cut-off frequency used in CreateMatCount, 'None' in default :param mode: 'offline' or 'online' :param Listsegments: a list of allowed glitches, which is used for online mode only, None in default :param re_sfchs: a list of safe channels except unused channels, which is used for online mode only, None in default :param Data_outputpath: a directory saving for a HDF5 file, which is used for online mode only, None in default :param Data_outputfilename: a file saving for a HDF5 file, which is used for online mode only, None in default :param PlusHOFT: whether to get data of HOFT {'True', 'False'}, which is used for online mode only, 'False' in default :param number_process: a number of processes in parallel, which is used 
for online mode only, None in default return None ''' if mode == 'offline': for group_label in list(self.f.keys()): # iterate though all glitches in a class g = self.f[group_label] if len(g.keys()) == 0: # if no dataset in this group, go to a next iteration continue if len(g.keys()) / 3 == int: # if there are missing channels, go to a next iteration continue #++++++++++ determine color upper and lower values +++++++++++++++++ SmallestValue = 1 # dummy global minimum LargestValue = 0 # dummy global maximum for ch_label in g.keys(): # iterate through channel data sets typeSeg = 'T' # only for target data sets if str(ch_label)[2] == typeSeg: # take only target data sets dset = g[ch_label] # get data set localMin = dset.value.min() # find a local minima localMax = dset.value.max() # find a local maxima if localMin <= SmallestValue: SmallestValue = localMin # replace a global minimum with a local minima if LargestValue <= localMax: LargestValue = localMax # replace a global maximum with a local maxima # set figure size fig = plt.figure(figsize=(25, 20)) # set outer grid of this plot Base_outer = GridSpec(2, 50, wspace=0.3, hspace=0.5) outer = GridSpecFromSubplotSpec(1, 7, subplot_spec=Base_outer[:49], wspace=0.3, hspace=0) #++++++++++++ plot frequency vs channels grouped based on their sampling rates ++++++++++++++++++++ for fig_num, target_samplerate in enumerate([256, 512, 1024, 2048, 4096, 8192, 16384][:], 0): # iterate though all the sampling rates # make matrix having channels in row, frequency in column, and list of extra paramters X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = self.make_subset_channel_based_on_samplingrate(g, target_samplerate) if X.shape[0] != 0: # transpose X (X.T) make this matrix having frequency in row, channel in column, but the frequency increases from the top to the bottom X = X.T[::-1, :] # turn up side down by [::-1,:], to make the frequency values go from the bottom to the top #+++++++++++ plot color indicator 
for channels on the top +++++++++++++++++++++++++++++ # color the background either with blakc or gray for each channel Listcolor = ['black', 'gray'] * 20 * 7 # make grid of an inner plot inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0) # plot the color indicator for channel on the top of this plot ax1 = fig.add_subplot(inner[:1, 0]) # First row, first column CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannel) for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if color_ind == ListInd.shape[-1] - 2: # add a cell for at the end to make a plot nicer ax1.axvspan(edgeLeft, edgeRight + 1, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) else: ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) ax1.axis(xmin=0, xmax=X.shape[-1]) ax1.set_xticks([]) # make ticks disappear ax1.set_yticks([]) # make ticks disappear #++++++++++ make a frequency versus channel plot +++++++++++++++++++ ax2 = fig.add_subplot(inner[1:, 0]) # First row, first column ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, vmin=SmallestValue, vmax=LargestValue, origin='upper') #ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, norm=LogNorm(vmin=SmallestValue, vmax=LargestValue)) # log scale # make frequency ticks to be correct cutoff_freq_left = 0 cutoff_freq_right = target_samplerate / 2 interval = 5 y_tics = np.linspace(cutoff_freq_left, cutoff_freq_right, interval, endpoint=True)[::-1].astype(int) # new ticks ax2.set_yticks(np.linspace(0, X.shape[0] - 1, interval, endpoint=True)) # set y ticks locations ax2.set_yticklabels(y_tics, size=15) # replace y ticks with a frequency values ax2.set_xticks(CenterTicks) # locate x ticks ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) # replace x ticks with channel names ax2.xaxis.set_tick_params(labelbottom=False, labeltop=True, direction='in', pad=3, length=0, 
labelrotation=90, size=0, labelsize=10) # set x ticks paramters ax2.grid(False) # write y label only for the left pannel as the rest are duplicated if fig_num == 0: plt.ylabel('Frequency [Hz]', fontsize=20) elif X.shape[0] == 0: inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0) ax2 = fig.add_subplot(inner[:, 0]) ax2.set_xticks([]) ax2.set_yticks([]) ax2.grid(False) if fig_num == 0: plt.ylabel('Frequency [Hz]', fontsize=20) ###### Make a color bar inner2 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[49], wspace=0.01, hspace=0.5) cmap = matplotlib.cm.viridis norm = matplotlib.colors.Normalize(vmin=SmallestValue, vmax=LargestValue) ax_bar = fig.add_subplot(inner2[0]) cbar = matplotlib.colorbar.ColorbarBase(ax_bar, cmap=cmap, norm=norm, orientation='vertical', drawedges=False) ax_bar.grid(False) cbar.set_label('Whitened FFT', size=10) # get a URL of Q-transform of h(t) try: URL = GravitySpy_df[GravitySpy_df['id'] == ID]['imgUrl'].tolist()[0] except IndexError: # deal with the one with no Q-transport plot of h(t) URL = 'None' # give an overall title if LowerCutOffFreq == 'None' and UpperCutOffFreq == 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s\n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, URL), fontsize=20) elif LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}- Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), URL), fontsize=20) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = -{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(UpperCutOffFreq), URL), fontsize=20) elif LowerCutOffFreq != 'None' 
and UpperCutOffFreq != 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}-{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), int(UpperCutOffFreq), URL), fontsize=20) # x label for the above plot fig.text(0.5, 0.55, 'Auxiliary channel', ha='center', va='center', fontsize=20) #++++++++ plot importance versus channel inner3 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[50:], wspace=0.1, hspace=0.5) ax = fig.add_subplot(inner3[0]) # First row, first column # create a count matrix of this glitch MatCount, ListChannelName, ListSNR, ListConf = self.CreateMatCount(sigma, g, LowerCutOffFreq, UpperCutOffFreq) # actually plot it self.PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax=ax) output_file = '{}.pdf'.format(str(GPS).replace('.', '-')) output_path = os.path.join(output_dir, output_file) fig.savefig(output_path) plt.close() print('Completed plotting for a(n) {} glitch in {} at {}'.format(glitchtype, IFO, str(GPS))) elif mode == 'online': for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, Data_outputpath, Data_outputfilename, number_process, PlusHOFT): # iterate group by group #++++++++++ determine color upper and lower values +++++++++++++++++ SmallestValue = 1 # dummy global minimum LargestValue = 0 # dummy global maximum for ch_label in g.keys(): # iterate through channel data sets typeSeg = 'T' # only for target data sets if str(ch_label)[2] == typeSeg: # take only target data sets dset = g[ch_label] # get data set localMin = dset.value.min() # find a local minima localMax = dset.value.max() # find a local maxima if localMin <= SmallestValue: SmallestValue = localMin # replace a global minimum with a local minima if LargestValue <= localMax: LargestValue = localMax # replace a global maximum with a local maxima # set figure size fig = plt.figure(figsize=(25, 20)) # set outer grid of this plot 
Base_outer = GridSpec(2, 50, wspace=0.3, hspace=0.5) outer = GridSpecFromSubplotSpec(1, 7, subplot_spec=Base_outer[:49], wspace=0.3, hspace=0) #++++++++++++ plot frequency vs channels grouped based on their sampling rates ++++++++++++++++++++ for fig_num, target_samplerate in enumerate([256, 512, 1024, 2048, 4096, 8192, 16384][:], 0): # iterate though all the sampling rates # make matrix having channels in row, frequency in column, and list of extra paramters X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = self.make_subset_channel_based_on_samplingrate(g, target_samplerate) # transpose X (X.T) make this matrix having frequency in row, channel in column, but the frequency increases from the top to the bottom X = X.T[::-1, :] # turn up side down by [::-1,:], to make the frequency values go from the bottom to the top #+++++++++++ plot color indicator for channels on the top +++++++++++++++++++++++++++++ # color the background either with blakc or gray for each channel Listcolor = ['black', 'gray'] * 20 * 7 # make grid of an inner plot inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0) # plot the color indicator for channel on the top of this plot ax1 = fig.add_subplot(inner[:1, 0]) # First row, first column CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannel) for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0): if color_ind == ListInd.shape[-1] - 2: # add a cell for at the end to make a plot nicer ax1.axvspan(edgeLeft, edgeRight + 1, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) else: ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None) ax1.axis(xmin=0, xmax=X.shape[-1]) ax1.set_xticks([]) # make ticks disappear ax1.set_yticks([]) # make ticks disappear #++++++++++ make a frequency versus channel plot +++++++++++++++++++ ax2 = fig.add_subplot(inner[1:, 0]) # from the second row, first column ax2.imshow(X, 
aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, vmin=SmallestValue, vmax=LargestValue, origin='upper') #ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, norm=LogNorm(vmin=SmallestValue, vmax=LargestVlaue)) # log scale # make frequency ticks to be correct cutoff_freq_left = 0 cutoff_freq_right = target_samplerate / 2 interval = 5 y_tics = np.linspace(cutoff_freq_left, cutoff_freq_right, interval, endpoint=True)[::-1].astype(int) # new ticks ax2.set_yticks(np.linspace(0, X.shape[0] - 1, interval, endpoint=True)) # set y ticks locations ax2.set_yticklabels(y_tics, size=15) # replace y ticks with a frequency values ax2.set_xticks(CenterTicks) # locate x ticks ax2.set_xticklabels(ListSubsys[ListInd.astype(int)]) # replace x ticks with channel names ax2.xaxis.set_tick_params(labelbottom=False, labeltop=True, direction='in', pad=3, length=0, labelrotation=90, size=0, labelsize=10) # set x ticks paramters ax2.grid(False) # write y label only for the left pannel as the rest are duplicated if fig_num == 0: plt.ylabel('Frequency [Hz]', fontsize=20) ###### Make a color bar inner2 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[49], wspace=0.01, hspace=0.5) cmap = matplotlib.cm.viridis norm = matplotlib.colors.Normalize(vmin=SmallestValue, vmax=LargestValue) ax_bar = fig.add_subplot(inner2[0]) cbar = matplotlib.colorbar.ColorbarBase(ax_bar, cmap=cmap, norm=norm, orientation='vertical', drawedges=False) ax_bar.grid(False) cbar.set_label('Whitened FFT', size=10) #### make a title ###### # get a URL of Q-transform of h(t) try: URL = GravitySpy_df[GravitySpy_df['id'] == ID]['imgUrl'].tolist()[0] except IndexError: # deal with the one with no Q-transport plot of h(t) URL = 'None' # give an overall title if LowerCutOffFreq == 'None' and UpperCutOffFreq == 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s\n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, 
duration, URL), fontsize=20) elif LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}- Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), URL), fontsize=20) elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = -{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(UpperCutOffFreq), URL), fontsize=20) elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': plt.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s, band = {}-{} Hz \n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, int(LowerCutOffFreq), int(UpperCutOffFreq), URL), fontsize=20) # x label for the above plot fig.text(0.5, 0.55, 'Auxiliary channel', ha='center', va='center', fontsize=20) #++++++++ plot importance versus channel inner3 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[50:], wspace=0.1, hspace=0.5) ax = fig.add_subplot(inner3[0]) # First row, first column # create a count matrix of this glitch MatCount, ListChannelName, ListSNR, ListConf = self.CreateMatCount(sigma, g, LowerCutOffFreq, UpperCutOffFreq) # actually plot it self.PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax=ax) output_file = '{}.pdf'.format(str(GPS).replace('.', '-')) output_path = os.path.join(output_dir, output_file) fig.savefig(output_path) plt.close() print('Completed saving a plot for a(n) {} glitch in {} at {}'.format(glitchtype, IFO, str(GPS))) return None
def FrequencyBandColor(self, PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High):
    '''
    description:
        1. load the three CSV files holding the low, middle and high frequency band importances
           (GPSImportanceChannels.csv); the 'Unnamed: 0' and 'GPS' columns are dropped from each
        2. build a matrix whose last axis stores the three band values of every cell as an RGB triplet
           (low -> red, index 0; middle -> green, index 1; high -> blue, index 2)
        3. output this matrix
    USAGE: RGBMat = FrequencyBandColor(PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High)

    :param PathGlitchChannel_Low: a path to a file that has a low frequency band
    :param PathGlitchChannel_Mid: a path to a file that has a middle frequency band
    :param PathGlitchChannel_High: a path to a file that has a high frequency band
    :return: RGBMat: a float array of shape (rows, columns, 3), where rows and columns are those of
             the low frequency band table after the two columns above are dropped
    :raises SystemExit: if one of the files cannot be opened (a message is printed first)
    :raises KeyError: if a file lacks the 'Unnamed: 0' or 'GPS' column
    :raises IndexError: if the middle or high band table has fewer rows or columns than the low one
    '''
    ### load data files (same handling for every band, in low -> mid -> high order)
    band_values = []
    for band_path in (PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High):
        try:
            band_df = pd.read_csv(band_path)
        except IOError:
            print('There is not such a file in {}'.format(band_path))
            sys.exit()
        # keep only the importance columns
        band_values.append(band_df.drop(['Unnamed: 0', 'GPS'], axis=1).values)

    # the low frequency band table fixes the size of the output
    n_rows, n_cols = band_values[0].shape

    # make a matrix with all the elements equal to zero
    RGBMat = np.zeros((n_rows, n_cols, 3))
    if n_rows == 0 or n_cols == 0:
        # nothing to fill: an empty low band table gives an empty (all-zero) matrix
        return RGBMat

    # fill the each value as RGB colors
    # low freq is red, middle freq is green, high freq is blue
    for color_index, values in enumerate(band_values):
        if values.shape[0] < n_rows or values.shape[1] < n_cols:
            # a table smaller than the low band one cannot fill every cell
            raise IndexError(
                'band table of shape {} is smaller than the low frequency band table of shape {}'.format(
                    values.shape, (n_rows, n_cols)))
        # one slice assignment per colour plane; a larger table is cut to its top-left part
        RGBMat[:, :, color_index] = values[:n_rows, :n_cols]

    return RGBMat