Discovering Tag Relationships

  • Brian Erickson

This is an investigation into finding relationships between tags. Tag relationships can act as leads for further investigation by newsfeed reporters. The key here is to quickly identify candidates with minimal processing and only do expensive groupwise modeling and evaluation on tags that are likely to be strongly related.

General Strategy

  • Identify similar kinds of tags using metadata and summary information
  • Identify candidate system relationships between tags using metadata and summary information
  • Each tag will maintain a list of the most similar tags by kind and system relationships
  • At this point each tag has candidates for kind and system analysis
  • Do pairwise evaluation of candidate system relationships to identify if tags truly have predictive relationships
  • "Crawl" highly correlated system relationships to find more relationships -- If tag_a relates to tag_b and tag_b relates to tag_c, then test to see if tag_a relates to tag_c -- If we find that a tag1 of "Kind1" relates to tag2 of "Kind2", try other combinations of "Kind1, Kind2", focusing on the metadata relationshp of tag1 to tag2. (same numbers, name patterns) -- Crawl tags that are on the same chart and look for system and kind relationships

How This is Useful

  • Tags in a kind are useful for outlier detection: if most tags in a kind exhibit certain properties but a small minority do not, those outliers may be interesting stories (see the sketch after this list)
  • Tags in a system are useful for correlative analysis. Stories of the form "usually tag_a and tag_b are in sync, now they aren't" can be generated
  • When a user views a chart, the system can "suggest" adding or navigating to other tags of the same kind or system
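
As a sketch of the kind-based outlier idea, assuming kinds are approximated by the EngUnit metadata grouping used later in this notebook and that the per-tag Average from the analog summaries is the property of interest; the 3-sigma cutoff is arbitrary.

In [ ]:
# Sketch: flag tags whose summary Average is unusual within their "kind",
# where a kind is approximated by the engineering-unit metadata.
def kind_outliers(tags, analog_summaries, z_cutoff=3.0):
    df = tags.join(analog_summaries[['Average']], how='inner')
    df['Average'] = df['Average'].astype(float)
    kind_cols = ['EngUnit', 'EngUnitMax', 'EngUnitMin']
    g = df.groupby(kind_cols)['Average']
    z = (df['Average'] - g.transform('mean')) / g.transform('std')
    return df[z.abs() > z_cutoff]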
In [58]:
import requests
from requests.auth import HTTPBasicAuth
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
In [59]:
# start with empty data frames
analog_summaries = pd.DataFrame()
tags = pd.DataFrame()
In [60]:
#auth=HTTPBasicAuth('[email protected]', 'W0nderware')
auth=None

# below is bearer for [email protected]
headers={"Authorization":"Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJlbWFpbCI6Im1pZ3VlbC50bjJAb3V0bG9vay5jb20iLCJ0ZW5hbnRpZCI6IjI2ODJiNDA3LWYyMzUtNGQzZC05ZTA2LTI0MjMwOGQ3NGVjNyIsImlzcyI6Imh0dHBzOi8vd29uZGVyd2FyZS5hdXRoMC5jb20vIiwiYXVkIjoicU5hQlRoVXFxRGdKSEhpNnNlSndWaXgzNjJLQkJ2MDAiLCJleHAiOjE0NjE3OTc3ODMsIm5iZiI6MTQ1OTk4MzM4M30.gD3XB75D8xCEX4dOomTHGBybn53QyvZsSGJu1Afk8yU"}
host = "https://devinfoclientapifunnel.azurewebsites.net"
#host = "https://devinfoclient.azurewebsites.net/apis"
In [61]:
def load_data_frames():
    global tags,analog_summaries
    tags=pd.read_pickle('tags.pickle')
    analog_summaries = pd.read_pickle('analog_summaries.pickle')
def save_data_frames():
    analog_summaries.to_pickle('analog_summaries.pickle')
    tags.to_pickle('tags.pickle')
In [62]:
load_data_frames()
In [63]:
def alpha_name(s):
    # strip digits so tags like R21.Temp and R22.Temp share the same name pattern
    return ''.join([i for i in s if not i.isdigit()])
In [64]:
alpha_name('Br1an32')
Out[64]:
'Bran'
In [65]:
def time_string():
    return time.strftime("%Y-%m-%d %H:%M:%S %z")
In [66]:
# example
time_string()
Out[66]:
'2016-04-12 15:07:43 +0000'
In [67]:
def get_all_tags():
    global auth
    odata_uri = (host+"/Historian/V1/Tags?"
        #"$top=10000"
        #"&$filter=Source eq 'Baytown' or Source eq 'Frankfurt'"
        "$select=Description,EngUnit,EngUnitMax,EngUnitMin,FQN,InterpolationType,MessageOn,MessageOff,Source,TagName,TagType"
    )
    r = requests.get(odata_uri,auth=auth,headers=headers)
    o = r.json()
    tags = pd.DataFrame(o['value']).set_index('FQN')
    while 'odata.nextLink' in o:
        odata_uri = o['odata.nextLink']
        print 'getting next data from:',odata_uri
        r = requests.get(odata_uri,auth=auth,headers=headers)
        o = r.json()
        tags = tags.append(pd.DataFrame(o['value']).set_index('FQN'))
    return tags
In [68]:
def get_analog_summary(fqn):
    global auth
    odata_uri = (host+"/Historian/V1/AnalogSummary?"
      "$filter=FQN eq '{0}' and StartDateTime ge datetimeoffset'2016-04-01' and EndDateTime le datetimeoffset'2016-04-03'"
      #"&Resolution=86400000"
      "&$select=*"
    ).format(fqn)
    r = requests.get(odata_uri,headers=headers)
    if not r.ok:
        # log the failing FQN and time before raising; in the original order
        # these diagnostics were unreachable after raise_for_status()
        s = 'query failed for fqn: {} {}'.format(fqn,r.text)
        print s
        print time_string()
        r.raise_for_status()
    return pd.DataFrame(r.json()['value']).set_index('FQN')
In [69]:
get_analog_summary(fqn='20150114a.TestTag_14')
Out[69]:
Average Count EndDateTime First FirstDateTime Integral Last LastDateTime MaxDateTime Maximum MinDateTime Minimum OpcQuality PercentGood [email protected] StartDateTime StdDev
FQN
20150114a.TestTag_14 1952 1 2016-04-03T00:00:00Z 1952 2015-01-14T23:35:38.546Z 337305600 1952 2015-01-14T23:35:38.546Z 1601-01-01T00:00:00Z NaN 1601-01-01T00:00:00Z NaN 192 100 https://devinfoclientapifunnel.azurewebsites.n... 2016-04-01T00:00:00Z 0
In [70]:
def get_n_summaries(n, verbose=False):
    global analog_summaries
    i = 0
    for fqn in tags.index:
        if fqn in analog_summaries.index: continue
        try:
            if tags.TagType[fqn] != 'Analog': continue
        except:
            #print "couldn't get tag type of", fqn
            continue
        i = i + 1
        if i > n:
            break        
        if verbose: print 'processing',fqn
        new_row = get_analog_summary(fqn=fqn)
        analog_summaries = analog_summaries.append(new_row)
In [71]:
tags['TagType'].value_counts()
Out[71]:
Analog      71632
Discrete    31300
String       7354
dtype: int64
In [72]:
tags['Source'].value_counts()
Out[72]:
KOCOil                   30972
test                     21549
ThirdApp                 17269
FirstApp                 17198
secondapp                17198
BayernOil                 1865
Tier2                     1212
WeatherApp                 677
yhdEfZZYVFoUd9PvhCbs       395
YPAoBLTEiQ6kaach3VrG       373
CPkgmLLDavUpx3vU1ixG       369
MasterApp                  178
WaterDemo                  169
scott                       80
scottdpapi                  75
devinfo_all                 55
Frankfurt                   50
Baytown                     49
20150114d                   40
Crane1                      40
20150114a                   40
ArunDemo                    35
ArunPub2                    35
arunrpe                     35
arunrpe2                    35
arunrpe3                    35
arunrpe1                    35
ArunDemo2                   35
PLPUploader3                32
Canterbury                  28
TestIndexing                27
ICPerfTestSample            15
Power                       13
nchic                       12
shissue                      8
shyamEFG                     7
MSDemo                       7
shyamABC                     7
ShyamDemoSource              7
ShyamTest1                   7
kiran_azuresearchtest        6
OnPremRepl                   5
FromTestApp                  4
SdkTest                      1
migueltest_DS                1
test2                        1
dtype: int64
In [73]:
def print_rows(df,rows):
#    h = pd.get_option('display.height')
    r = pd.get_option('display.max_rows')
#    pd.set_option('display.height', rows)
    pd.set_option('display.max_rows', rows)
    print df
#    pd.set_option('display.height', h)
    pd.set_option('display.max_rows', r)
In [74]:
f = ['EngUnit','EngUnitMax','EngUnitMin']
print_rows(tags.groupby(f).size(),70)
EngUnit  EngUnitMax  EngUnitMin
         0.0         0             38654
#        100.0       0                66
%        10.0        0                48
         34.0        21                3
         35.0        21                2
                     22                1
         36.0        21                2
         37.0        21                1
         100.0       0              1677
         200.0       0                 2
*C       100.0       0                 5
A        5.0         0                 1
         7.0         0                 1
         10.0        0                 1
         23.4        0                 2
...
°C        75          -90              4
          100          0             250
          150          0              21
          200         -30             20
                       0              85
          220          0              18
          350          0              23
°F        100          0              66
          160         -130            23
°C       50           0               2
          100          0            3354
°C/HOUR  100          0             210
µs/cm    2            0               8
          20           0               3
          200          0               1
Length: 602, dtype: int64
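A possible follow-up (a sketch, not part of the original analysis): collapse the grouping above into a single per-tag kind key so kind membership can be looked up per tag without re-running the groupby; the '|' separator is arbitrary.

In [ ]:
# Sketch: a per-tag kind key built from the same engineering-unit metadata
kind_key = (tags['EngUnit'].fillna('').astype(str) + '|'
            + tags['EngUnitMax'].astype(str) + '|'
            + tags['EngUnitMin'].astype(str))
kind_key.value_counts().head()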
In [76]:
c = tags[tags.EngUnit=='DegC'].sort_values(['EngUnitMax','EngUnitMin'])
c
Out[76]:
Description EngUnit EngUnitMax EngUnitMin InterpolationType MessageOff MessageOn Source TagName TagType AlphaName
FQN
Baytown.R21.Temp Temperature in reactor R21 DegC 60 10 Linear Baytown R21.Temp Analog R.Temp
Baytown.R22.Temp Temperature in reactor R22 DegC 60 10 Linear Baytown R22.Temp Analog R.Temp
Frankfurt.R21.Temp Temperature in reactor R21 DegC 60 10 Linear Frankfurt R21.Temp Analog R.Temp
Frankfurt.R22.Temp Temperature in reactor R22 DegC 60 10 Linear Frankfurt R22.Temp Analog R.Temp
Baytown.B100.Temperature Combined flow/pressure sensor DegC 210 140 Linear Baytown B100.Temperature Analog B.Temperature
Frankfurt.B100.Temperature Combined flow/pressure sensor DegC 210 140 Linear Frankfurt B100.Temperature Analog B.Temperature
Frankfurt.Foobar Test of REST storage API DegC 5000 -100 Linear Frankfurt Foobar Analog Foobar
In [77]:
# this prints too much data for the notebook
#tg = tags.groupby(['EngUnit','EngUnitMax','EngUnitMin'])
#for name, group in tg:
#    print(name)
#    print(group)
In [78]:
tag_duplicates = tags.groupby(['TagName']).size()
tag_duplicates[tag_duplicates > 1]
p=plt.hist(tag_duplicates,21)
tag_duplicates = tag_duplicates.sort_values()
tag_duplicates.tail(3)
Out[78]:
TagName
ReactTemp    18
ProdLevel    18
Auto         20
dtype: int64
In [79]:
tags['AlphaName'] = tags['TagName'].apply(alpha_name)
alpha_duplicates = tags.groupby(['AlphaName']).size()
p=plt.hist(alpha_duplicates[alpha_duplicates>1],500)
plt.gca().set_xscale("log")
plt.gca().set_yscale("log")
plt.show()
alpha_duplicates = alpha_duplicates.sort_values()
alpha_duplicates.tail(3)
Out[79]:
AlphaName
PrevDTTotalStops_    6056
DTTotalTime_         7225
DTTotalStops_        7227
dtype: int64
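A sketch of the "each tag maintains a list of the most similar tags by kind" step from the strategy above, using the AlphaName and EngUnit columns that now exist; the cap of 20 candidates per tag is an arbitrary illustrative choice.

In [ ]:
# Sketch: for each tag, collect candidate same-kind tags that share the
# digit-stripped name and engineering unit, capped to keep lists small.
def kind_candidates(tags, max_candidates=20):
    candidates = {}
    for (alpha, unit), group in tags.groupby(['AlphaName', 'EngUnit']):
        fqns = list(group.index)
        if len(fqns) < 2:
            continue
        for fqn in fqns:
            candidates[fqn] = [f for f in fqns if f != fqn][:max_candidates]
    return candidates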
In [80]:
#get_n_summaries(100)
len(analog_summaries)
Out[80]:
73082
In [81]:
len(analog_summaries)
Out[81]:
73082
In [82]:
n=100
# loop disabled via range(0); change to e.g. range(10) to fetch 10 more batches of n summaries
for i in range(0):
    print 'getting {} analog summaries'.format(n)
    get_n_summaries(n)
    print time_string(), "count:", len(analog_summaries)
    save_data_frames()
    
In [88]:
plt.figure(figsize=(20,5))
c = analog_summaries.Count.astype(float)
print len(c)
p=plt.hist(c[c>1],bins=np.logspace(0,np.log10(c.max()),500))
plt.gca().set_yscale("log")
plt.gca().set_xscale("log")
plt.title('tag cohort identification by value count over 2 days')
plt.xlabel('Value Count')
plt.ylabel('Number of Tags')
73082
Out[88]:
<matplotlib.text.Text at 0x7fcdd503f750>
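One way to act on the histogram above (a sketch; the quarter-decade bucket width is an arbitrary choice): bucket tags by the order of magnitude of their 2-day value count, so tags reporting at roughly the same rate become cheap candidate cohorts.

In [ ]:
# Sketch: cohorts by rounded log10 of the 2-day value count
counts = analog_summaries.Count.astype(float)
cohort = np.floor(np.log10(counts.clip(lower=1)) * 4) / 4  # quarter-decade buckets
cohort.value_counts().sort_index().head(10)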
In [86]:
plt.figure(figsize=(20,5))
# histogram of tag averages with log-spaced bins, ignoring averages near zero
avg = analog_summaries.Average.astype('float')
avg_max = avg.abs().max()
p = plt.hist(avg[avg.abs() > 0.1],
             bins=np.logspace(-1, np.log10(avg_max), 500))
plt.gca().set_yscale("log")
plt.gca().set_xscale("log")
plt.title('tag cohort identification by average value over 2 days')
plt.xlabel('average value')
plt.ylabel('number of tags')
Out[86]:
<matplotlib.text.Text at 0x7fcdd595de50>
In [ ]: