CoCalc -- readligo.py

Views: ⁸⁰⁵
1
"""
2
readligo.py
3

4
This module provides tools for reading LIGO data
5
files.  Data along with supporting documentation
6
can be downloaded from the losc web site:
7
https://losc.ligo.org
8
The method getstrain() is the main function for 
9
loading LIGO data.  Some possible use 
10
cases are shown below.
11

12
Example #1: 
13
segList = getsegs(842657792, 842658792, 'H1')
14
for (start, stop) in segList:
15
  strain, meta, dq = getstrain(start, stop, 'H1')
16
  # -- Analysis code here
17
  ...
18

19
This default configuration assumes that the needed LIGO data 
20
files are available in the current working directory or a 
21
subdirectory.  LIGO data between the input GPS times is loaded
22
into STRAIN.  META is a dictionary of gps start, gps stop, and the 
23
sample time.  DQ is a dictionary of data quality flags.
24

25
Example #2
26
segList = SegmentList('H1_segs.txt')
27

28
In Example 2, 'H1_segs.txt' is a segment list downloaded from the
29
LOSC web site using the Timeline application.  This may be used in the same
30
manner as segList in example 1.
31

32
Example #3
33
filelist = FileList(directory='/home/ligodata')
34
segList = getsegs(842657792, 842658792, 'H1', filelist=filelist)
35
for start, stop in segList:
36
  strain, meta, dq = getstrain(start, stop, 'H1', filelist=filelist)
37
  # -- Analysis code here
38

39
In this example, the first command searches the indicated directory and 
40
sub-directories for LIGO data files.  This list of data files is then 
41
used to construct a segment list and load the requested data.  
42

43
-- SEGMENT LISTS --
44

45
Segment lists may be downloaded from the LOSC web site
46
using the Timeline Query Form or constructed directly
47
from the data files.  
48

49
Read in a segment list downloaded from the Timeline 
50
application on the LOSC web site with SegmentList:
51
>> seglist = SegmentList('H1_segs.txt')
52
OR
53
Construct a segment list directly from the LIGO
54
data files with getsegs():
55
>> seglist = getsegs(842657792, 842658792, 'H1', flag='DATA', filelist=None)
56

57
Written by Jonah Kanner
58
2014 
59
"""
60

61
import numpy as np
62
import os
63
import fnmatch
64

65
def read_frame(filename, ifo, readstrain=True):
66
    """
67
    Helper function to read frame files
68
    """
69
    try:
70
        import Fr
71
    except:
72
        from pylal import Fr
73

74
    if ifo is None:
75
        raise TypeError("""To read GWF data, ifo must be 'H1', 'H2', or 'L1'.
76
        def loaddata(filename, ifo=None):""")
77

78
    #-- Read strain channel
79
    strain_name = ifo + ':LOSC-STRAIN'
80
    if readstrain:
81
        sd = Fr.frgetvect(filename, strain_name)    
82
        strain = sd[0]
83
        gpsStart = sd[1] 
84
        ts = sd[3][0]
85
    else:
86
        ts = 1
87
        strain = 0
88
    
89
    #-- Read DQ channel
90
    dq_name = ifo + ':LOSC-DQMASK'
91
    qd = Fr.frgetvect(filename, dq_name)
92
    gpsStart = qd[1]
93
    qmask = np.array(qd[0])
94
    dq_ts = qd[3][0]
95
    shortnameList_wbit = qd[5].split()
96
    shortnameList = [name.split(':')[1] for name in shortnameList_wbit]
97

98
    #-- Read Injection channel
99
    inj_name = ifo + ':LOSC-INJMASK'
100
    injdata = Fr.frgetvect(filename, inj_name)
101
    injmask = injdata[0]
102
    injnamelist_bit = injdata[5].split()
103
    injnamelist     = [name.split(':')[1] for name in injnamelist_bit]
104

105
    return strain, gpsStart, ts, qmask, shortnameList, injmask, injnamelist
106
    
107
def read_hdf5(filename, readstrain=True):
108
    """
109
    Helper function to read HDF5 files
110
    """
111
    import h5py
112
    dataFile = h5py.File(filename, 'r')
113

114
    #-- Read the strain
115
    if readstrain:
116
        strain = dataFile['strain']['Strain'][...]
117
    else:
118
        strain = 0
119

120
    ts = dataFile['strain']['Strain'].attrs['Xspacing']
121
    
122
    #-- Read the DQ information
123
    dqInfo = dataFile['quality']['simple']
124
    qmask = dqInfo['DQmask'][...]
125
    shortnameArray = dqInfo['DQShortnames'].value
126
    shortnameList  = list(shortnameArray)
127
    
128
    # -- Read the INJ information
129
    injInfo = dataFile['quality/injections']
130
    injmask = injInfo['Injmask'][...]
131
    injnameArray = injInfo['InjShortnames'].value
132
    injnameList  = list(injnameArray)
133
    
134
    #-- Read the meta data
135
    meta = dataFile['meta']
136
    gpsStart = meta['GPSstart'].value    
137
    
138
    dataFile.close()
139
    return strain, gpsStart, ts, qmask, shortnameList, injmask, injnameList
140

141
def loaddata(filename, ifo=None, tvec=True, readstrain=True):
142
    """
143
    The input filename should be a LOSC .hdf5 file or a LOSC .gwf
144
    file.  The file type will be determined from the extenstion.  
145
    The detector should be H1, H2, or L1.
146

147
    The return value is: 
148
    STRAIN, TIME, CHANNEL_DICT
149

150
    STRAIN is a vector of strain values
151
    TIME is a vector of time values to match the STRAIN vector
152
         unless the flag tvec=False.  In that case, TIME is a
153
         dictionary of meta values.
154
    CHANNEL_DICT is a dictionary of data quality channels    
155
    """
156

157
    # -- Check for zero length file
158
    if os.stat(filename).st_size == 0:
159
        return None, None, None
160

161
    file_ext = os.path.splitext(filename)[1]    
162
    if (file_ext.upper() == '.GWF'):
163
        strain, gpsStart, ts, qmask, shortnameList, injmask, injnameList = read_frame(filename, ifo, readstrain)
164
    else:
165
        strain, gpsStart, ts, qmask, shortnameList, injmask, injnameList = read_hdf5(filename, readstrain)
166
        
167
    #-- Create the time vector
168
    gpsEnd = gpsStart + len(qmask)
169
    if tvec:
170
        time = np.arange(gpsStart, gpsEnd, ts)
171
    else:
172
        meta = {}
173
        meta['start'] = gpsStart
174
        meta['stop']  = gpsEnd
175
        meta['dt']    = ts
176

177
    #-- Create 1 Hz DQ channel for each DQ and INJ channel
178
    channel_dict = {}  #-- 1 Hz, mask
179
    slice_dict   = {}  #-- sampling freq. of stain, a list of slices
180
    final_one_hz = np.zeros(qmask.shape, dtype='int32')
181
    for flag in shortnameList:
182
        bit = shortnameList.index(flag)
183
        channel_dict[flag] = (qmask >> bit) & 1
184

185
    for flag in injnameList:
186
        bit = injnameList.index(flag)
187
        channel_dict[flag] = (injmask >> bit) & 1
188
       
189
    #-- Calculate the DEFAULT channel
190
    try:
191
        channel_dict['DEFAULT'] = ( channel_dict['DATA'] )
192
    except:
193
        print "Warning: Failed to calculate DEFAULT data quality channel"
194

195
    if tvec:
196
        return strain, time, channel_dict
197
    else:
198
        return strain, meta, channel_dict
199

200

201
def dq2segs(channel, gps_start):
202
    """
203
    This function takes a DQ CHANNEL (as returned by loaddata or getstrain) and 
204
    the GPS_START time of the channel and returns a segment
205
    list.  The DQ Channel is assumed to be a 1 Hz channel.
206

207
    Returns of a list of segment GPS start and stop times.
208
    """
209
    #-- Check if the user input a dictionary
210
    if type(channel) == dict:
211
        try:
212
            channel = channel['DEFAULT']
213
        except:
214
            print "ERROR: Could not find DEFAULT channel in dictionary"
215
            raise
216

217
    #-- Create the segment list
218
    segments = dq_channel_to_seglist(channel, fs=1)
219
    t0 = gps_start
220
    segList = [(int(seg.start+t0), int(seg.stop+t0)) for seg in segments]
221
    return SegmentList(segList)
222
    
223
def dq_channel_to_seglist(channel, fs=4096):
224
    """
225
    WARNING: 
226
    This function is designed to work the output of the low level function
227
    LOADDATA, not the output from the main data loading function GETSTRAIN.
228

229
    Takes a data quality 1 Hz channel, as returned by
230
    loaddata, and returns a segment list.  The segment
231
    list is really a list of slices for the strain 
232
    associated strain vector.  
233

234
    If CHANNEL is a dictionary instead of a single channel,
235
    an attempt is made to return a segment list for the DEFAULT
236
    channel.  
237

238
    Returns a list of slices which can be used directly with the 
239
    strain and time outputs of LOADDATA.
240
    """
241
    #-- Check if the user input a dictionary
242
    if type(channel) == dict:
243
        try:
244
            channel = channel['DEFAULT']
245
        except:
246
            print "ERROR: Could not find DEFAULT channel in dictionary"
247
            raise
248

249
    # -- Create the segment list
250
    condition = channel > 0
251
    boundaries = np.where(np.diff(condition) == True)[0]
252
    # -- Need to +1 due to how np.diff works 
253
    boundaries = boundaries + 1
254
    # if the array "begins" True, we need to complete the first segment
255
    if condition[0]:
256
        boundaries = np.append(0,boundaries)
257
    # if the array "ends" True, we need to complete the last segment
258
    if condition[-1]:
259
        boundaries = np.append(boundaries,len(condition))
260

261
    # -- group the segment boundaries two by two
262
    segments = boundaries.reshape((len(boundaries)/2,2))
263
    # -- Account for sampling frequency and return a slice
264
    segment_list = [slice(start*fs, stop*fs) for (start,stop) in segments]
265
    
266
    return segment_list
267

268
class FileList():
269
    """
270
    Class for lists of LIGO data files.
271
    
272
    When a FileList instance is created, DIRECTORY will 
273
    be searched for LIGO data files.  Sub-directories
274
    will be searched as well.  By default, the current
275
    working directory is searched.  
276
    """
277
    def __init__(self, directory=None, cache=None):
278

279
        # -- Set default directory
280
        if directory is None:
281
            if os.path.isdir('/archive/losc/strain-gwf'):
282
                directory='/archive/losc/strain-gwf'
283
            else:
284
                directory='.'
285

286
        print "Using data directory {0} ...".format(directory)
287
        self.directory = directory
288
        self.cache = cache
289
        if cache is None:
290
            self.list = self.searchdir(directory)
291
        else:
292
            self.readcache()
293

294
    def searchdir(self, directory='.'):
295
        frameList = []
296
        hdfList   = []
297
        for root, dirnames, filenames in os.walk(directory):
298
            for filename in fnmatch.filter(filenames, '*.gwf'):
299
                frameList.append(os.path.join(root, filename))
300
            for filename in fnmatch.filter(filenames, '*.hdf5'):
301
                hdfList.append(os.path.join(root, filename))
302
        return frameList + hdfList
303

304
    def writecache(self, cacheName):
305
        outfile = open(cacheName, 'w')
306
        for file in self.list:
307
            outfile.write(file + '\n')
308
        outfile.close()
309

310
    def readcache(self):
311
        infile = open(self.cache, 'r')
312
        self.list = infile.read().split()
313
        infile.close()
314
    
315
    def findfile(self, gps, ifo):
316
        start_gps = gps - (gps % 4096)
317
        filenamelist = fnmatch.filter(self.list, '*' + '-' + ifo + '*' + '-' + str(start_gps) + '-' + '*')
318
        if len(filenamelist) == 0:
319
            print "WARNING!  No file found for GPS {0} and IFO {1}".format(gps, ifo)
320
            return None
321
        else:
322
            return filenamelist[0]
323
            
324
def getstrain(start, stop, ifo, filelist=None):
325
    """
326
    START should be the starting gps time of the data to be loaded.
327
    STOP  should be the end gps time of the data to be loaded.
328
    IFO should be 'H1', 'H2', or 'L1'.
329
    FILELIST is an optional argument that is a FileList() instance.
330

331
    The return value is (strain, meta, dq)
332
    
333
    STRAIN: The data as a strain time series
334
    META: A dictionary of meta data, especially the start time, stop time, 
335
          and sample time
336
    DQ: A dictionary of the data quality flags
337
    """
338

339
    if filelist is None:
340
        filelist = FileList()
341

342
    # -- Check if this is a science segment
343
    segList = getsegs(start, stop, ifo, flag='DATA', filelist=filelist)
344
    sl = segList.seglist
345
    if (sl[0][0] == start) and (sl[0][1] == stop):
346
        pass
347
    else:
348
        raise TypeError("""Error in getstrain.
349
        Requested times include times where the data file was not found
350
        or instrument not in SCIENCE mode.
351
        Use readligo.getsegs() to construct a segment list.
352
        The science mode segment list for the requested time range is: 
353
        {0}""".format(segList))
354

355
    # -- Construct list of expected file start times
356
    first = start - (start % 4096)
357
    gpsList = np.arange(first, stop, 4096)
358

359
    m_strain = np.array([])
360
    m_dq     = None
361
    # -- Loop over needed files
362
    for time in gpsList:
363
        filename = filelist.findfile(time, ifo)
364
        print "Loading {0}".format(filename)
365

366
        #-- Read in data
367
        strain, meta, dq = loaddata(filename, ifo, tvec=False)
368
        if len(m_strain) == 0:
369
            m_start = meta['start']
370
            dt = meta['dt']
371
        m_stop = meta['stop']
372
        m_strain = np.append(m_strain, strain)
373
        if m_dq is None:
374
            m_dq = dq
375
        else:
376
            for key in dq.keys():
377
                m_dq[key] = np.append(m_dq[key], dq[key])
378

379
    # -- Trim the data
380
    lndx  = np.abs(start - m_start)*(1.0/dt)
381
    rndx = np.abs(stop - m_start)*(1.0/dt)
382
        
383
    m_strain = m_strain[lndx:rndx]
384
    for key in m_dq.keys():
385
        m_dq[key] = m_dq[key][lndx*dt:rndx*dt]
386

387
    meta['start'] = start
388
    meta['stop']  = stop
389
    meta['dt']    = dt
390

391
    return m_strain, meta, m_dq
392

393
class SegmentList():
394
    def __init__(self, filename, numcolumns=3):
395

396
        if type(filename) is str:
397
            if numcolumns == 4:
398
                number, start, stop, duration = np.loadtxt(filename, dtype='int',unpack=True)
399
            elif numcolumns == 2:
400
                start, stop = np.loadtxt(filename, dtype='int',unpack=True)
401
            elif numcolumns == 3:
402
                start, stop, duration = np.loadtxt(filename, dtype='int',unpack=True)
403
            self.seglist = zip(start, stop)
404
        elif type(filename) is list:
405
            self.seglist = filename
406
        else:
407
            raise TypeError("SegmentList() expects the name of a segmentlist file from the LOSC website Timeline")
408

409
    def __repr__(self):
410
        return 'SegmentList( {0} )'.format(self.seglist)
411
    def __iter__(self):
412
        return iter(self.seglist)
413
    def __getitem__(self, key):
414
        return self.seglist[key]
415
       
416
def getsegs(start, stop, ifo, flag='DATA', filelist=None):
417
    """
418
    Method for constructing a segment list from 
419
    LOSC data files.  By default, the method uses
420
    files in the current working directory to 
421
    construct a segment list.  
422

423
    If a FileList is passed in the flag FILELIST,
424
    then those files will be searched for segments
425
    passing the DQ flag passed as the FLAG argument.
426
    """
427

428
    if filelist is None:
429
        filelist = FileList()
430

431
    # -- Construct list of expected file start times
432
    first = start - (start % 4096)
433
    gpsList = np.arange(first, stop, 4096)
434
    m_dq     = None
435
    
436
    # -- Initialize segment list
437
    segList = []
438

439
    # -- Loop over needed files
440
    for time in gpsList:
441
        filename = filelist.findfile(time, ifo)
442

443
        #-- Read in data
444
        if filename is None:
445
            print "WARNING! No file found with GPS start time {0}".format(time)
446
            print "Segment list may contain errors due to missing files."
447
            continue
448
        else:
449
            try:
450
                strain, meta, dq = loaddata(filename, ifo, tvec=False, readstrain=False)     
451
            except:
452
                print "WARNING! Failed to load file {0}".format(filename)
453
                print "Segment list may contain errors due to corrupt files."
454
                continue
455

456
        if dq is None:
457
            print "Warning! Found zero length file {0}".format(filename)
458
            print "Segment list may contain errors."
459
            continue
460

461
        #-- Add segments to list on a file-by-file basis
462
        chan = dq[flag]
463
        indxlist = dq_channel_to_seglist(chan, fs=1.0)
464
        i_start = meta['start']
465
        i_seglist = [(indx.start+i_start, indx.stop+i_start) for indx in indxlist]
466
        i_seglist = [(int(begin), int(end)) for begin, end in i_seglist] 
467
        segList = segList + i_seglist
468
      
469
    # -- Sort segments
470
    segList.sort()
471
    
472
    # -- Merge overlapping segments
473
    for i in range(0, len(segList)-1):
474
        seg1 = segList[i]
475
        seg2 = segList[i+1]
476
    
477
        if seg1[1] == seg2[0]:
478
            segList[i]   = None
479
            segList[i+1] = (seg1[0], seg2[1])            
480
    # -- Remove placeholder segments
481
    segList = [seg for seg in segList if seg is not None]
482

483
    # -- Trim segment list to fit within requested start/stop times
484
    for seg in segList:
485
        idx = segList.index(seg)
486
        if (seg[1] < start):
487
            segList[idx] = None
488
        elif (seg[0] > stop):
489
            segList[idx] = None
490
        elif (seg[0] < start) and (seg[1] > stop):
491
            segList[idx] = (start, stop)
492
        elif (seg[0] < start):
493
            segList[idx] = (start, seg[1])
494
        elif (seg[1] > stop):
495
            segList[idx] = (seg[0], stop)
496
    # -- Remove placeholder segments
497
    segList = [seg for seg in segList if seg is not None]
498

499
    return SegmentList(segList)
500

501

502

503