Discovering Tag Relationships
This notebook investigates ways of finding relationships between historian tags.
General Strategy
Quickly identify candidate tags with minimal processing, and only run expensive groupwise modeling and evaluation on tags that are likely to be strongly related.
How This is Useful
Tag relationships can act as leads for further investigation by newsfeed reporters.
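As a sketch of the two-stage shape of the approach (the arguments cheap_key and fetch_series are hypothetical placeholders for the grouping keys and data fetches developed below, and pairwise correlation stands in for whatever groupwise model is eventually used):
import itertools

def related_tag_leads(tags, cheap_key, fetch_series, min_score=0.9):
    # stage 1 (cheap): bucket tags by an inexpensive key computed from
    # metadata or summary statistics; a bucket of one can't yield a pair
    buckets = tags.groupby(tags.index.map(cheap_key))
    leads = []
    for _, bucket in buckets:
        if len(bucket) < 2:
            continue
        # stage 2 (expensive): fetch values and model only within a bucket
        for a, b in itertools.combinations(bucket.index, 2):
            score = fetch_series(a).corr(fetch_series(b))
            if abs(score) >= min_score:
                leads.append((a, b, score))
    return leads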
import requests
from requests.auth import HTTPBasicAuth
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
# start with empty data frames
analog_summaries = pd.DataFrame()
tags = pd.DataFrame()
#auth = HTTPBasicAuth('[email protected]', '<password>')
auth = None
# bearer token for [email protected] (replace <token> with a valid token)
headers = {"Authorization": "Bearer <token>"}
host = "https://devinfoclientapifunnel.azurewebsites.net"
#host = "https://devinfoclient.azurewebsites.net/apis"
def load_data_frames():
    # restore previously fetched tag metadata and summaries from local pickles
    global tags, analog_summaries
    tags = pd.read_pickle('tags.pickle')
    analog_summaries = pd.read_pickle('analog_summaries.pickle')
def save_data_frames():
    # persist fetched data so API calls don't have to be repeated across sessions
    analog_summaries.to_pickle('analog_summaries.pickle')
    tags.to_pickle('tags.pickle')
load_data_frames()
def alpha_name(s):
    # strip digits so tags that differ only by embedded numbers share a name
    return ''.join([c for c in s if not c.isdigit()])
# example: returns 'Bran'
alpha_name('Br1an32')
def time_string():
    return time.strftime("%Y-%m-%d %H:%M:%S %z")
# example
time_string()
def get_all_tags():
    global auth
    odata_uri = (host + "/Historian/V1/Tags?"
                 #"$top=10000"
                 #"&$filter=Source eq 'Baytown' or Source eq 'Frankfurt'"
                 "$select=Description,EngUnit,EngUnitMax,EngUnitMin,FQN,InterpolationType,MessageOn,MessageOff,Source,TagName,TagType"
                 )
    r = requests.get(odata_uri, auth=auth, headers=headers)
    o = r.json()
    tags = pd.DataFrame(o['value']).set_index('FQN')
    # follow OData server-side paging links until the full tag list is fetched
    while 'odata.nextLink' in o:
        odata_uri = o['odata.nextLink']
        print('getting next data from:', odata_uri)
        r = requests.get(odata_uri, auth=auth, headers=headers)
        o = r.json()
        tags = pd.concat([tags, pd.DataFrame(o['value']).set_index('FQN')])
    return tags
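The pickles loaded above were presumably produced by a one-time fetch along these lines:
tags = get_all_tags()
save_data_frames()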
def get_analog_summary(fqn):
    global auth
    odata_uri = (host + "/Historian/V1/AnalogSummary?"
                 "$filter=FQN eq '{0}' and StartDateTime ge datetimeoffset'2016-04-01' and EndDateTime le datetimeoffset'2016-04-03'"
                 #"&Resolution=86400000"
                 "&$select=*"
                 ).format(fqn)
    r = requests.get(odata_uri, headers=headers)
    if not r.ok:
        # report which tag failed before raising, so long runs are easier to debug
        print('query failed for fqn: {} {}'.format(fqn, r.text))
        print(time_string())
        r.raise_for_status()
    return pd.DataFrame(r.json()['value']).set_index('FQN')
get_analog_summary(fqn='20150114a.TestTag_14')
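The summary rows come back with aggregate columns for the queried window; Count and Average are the two used for cohort identification below.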
def get_n_summaries(n, verbose=False):
    # fetch summaries for up to n analog tags that haven't been fetched yet
    global analog_summaries
    i = 0
    for fqn in tags.index:
        if fqn in analog_summaries.index:
            continue
        try:
            if tags.TagType[fqn] != 'Analog':
                continue
        except Exception:
            # couldn't get the tag type of this fqn; skip it
            continue
        i = i + 1
        if i > n:
            break
        if verbose:
            print('processing', fqn)
        new_row = get_analog_summary(fqn=fqn)
        analog_summaries = pd.concat([analog_summaries, new_row])
tags['TagType'].value_counts()
tags['Source'].value_counts()
def print_rows(df, rows):
    # temporarily raise the row display limit so the whole grouping prints
    with pd.option_context('display.max_rows', rows):
        print(df)
f = ['EngUnit','EngUnitMax','EngUnitMin']
print_rows(tags.groupby(f).size(),70)
tags.Source.value_counts()
c = tags[tags.EngUnit == 'DegC'].sort_values(['EngUnitMax', 'EngUnitMin'])
c
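Unit plus engineering range already suggests cohorts; a sketch of collecting every multi-member group's FQNs for later modeling (requiring more than one member is the only choice made here):
# any (EngUnit, EngUnitMax, EngUnitMin) group with more than one member
# is a candidate cohort for groupwise modeling
cohorts = tags.groupby(['EngUnit', 'EngUnitMax', 'EngUnitMin'])
candidate_cohorts = {key: group.index.tolist()
                     for key, group in cohorts if len(group) > 1}
len(candidate_cohorts)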
# this prints too much data for the notebook
#tg = tags.groupby(['EngUnit','EngUnitMax','EngUnitMin'])
#for name, group in tg:
# print(name)
# print(group)
tag_duplicates = tags.groupby(['TagName']).size()
tag_duplicates[tag_duplicates > 1]
p=plt.hist(tag_duplicates,21)
tag_duplicates = tag_duplicates.sort_values()
tag_duplicates.tail(3)
tags['AlphaName'] = tags['TagName'].apply(alpha_name)
alpha_duplicates = tags.groupby(['AlphaName']).size()
p=plt.hist(alpha_duplicates[alpha_duplicates>1],500)
plt.gca().set_xscale("log")
plt.gca().set_yscale("log")
plt.show()
alpha_duplicates = alpha_duplicates.sort_values()
alpha_duplicates.tail(3)
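Tags sharing an AlphaName (differing only in digits, like 'Br1an32' and 'Br2an45') are natural relationship candidates; a sketch of enumerating candidate pairs, where the size cap of 20 is an arbitrary guard against very large groups:
import itertools

# emit candidate (fqn, fqn) pairs within each AlphaName group, skipping
# huge groups where pairwise enumeration would blow up combinatorially
candidate_pairs = []
for name, group in tags.groupby('AlphaName'):
    if 1 < len(group) <= 20:
        candidate_pairs.extend(itertools.combinations(group.index, 2))
len(candidate_pairs)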
#get_n_summaries(100)
len(analog_summaries)
n = 100
# the range is left at 0 so re-running the notebook doesn't refetch;
# change it to range(k) to pull k more batches of n summaries
for i in range(0):
    print('getting {} analog summaries'.format(n))
    get_n_summaries(n)
    print(time_string(), "count:", len(analog_summaries))
    save_data_frames()
plt.figure(figsize=(20, 5))
c = analog_summaries.Count.astype(float)
print(len(c))
# log-spaced bins; np.logspace takes base-10 exponents, so use log10 for the upper bound
p = plt.hist(c[c > 1], bins=np.logspace(0, np.log10(max(c)), 500))
plt.gca().set_yscale("log")
plt.gca().set_xscale("log")
plt.title('tag cohort identification by value count over 2 days')
plt.xlabel('Value Count')
plt.ylabel('Number of Tags')
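Tags with identical value counts over the window were likely recorded on the same schedule, so a shared count is another cheap grouping key; a sketch:
# group analog tags by their two-day value count; groups sharing a count
# are candidate cohorts, analogous to the unit/range grouping above
count_key = analog_summaries.Count.astype(float)
count_groups = analog_summaries.groupby(count_key).size()
count_groups[count_groups > 1].sort_values(ascending=False).head(10)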
plt.figure(figsize=(20, 5))
# derive the bin range from the data; histogram absolute values,
# since negative averages can't appear on a log-scaled axis
avg = analog_summaries.Average.astype(float).abs()
avg_max = avg.max()
p = plt.hist(avg[avg > 0.1], bins=np.logspace(np.log10(0.1), np.log10(avg_max), 500))
plt.gca().set_yscale("log")
plt.gca().set_xscale("log")
plt.title('tag cohort identification by average value over 2 days')
plt.xlabel('average value')
plt.ylabel('number of tags')