Description: An XML gleaner for collecting MN, WI, and IA METAR weather data.
Written by: Jim Miller

An example of an XML report from a single station.

Here is an example of a single-site query:

And the XML report:
<response version="1.2" xsi:noNamespaceSchemaLocation="http://aviationweather.gov/adds/schema/metar1_2.xsd">
    <data_source name="metars"/>
    <request type="retrieve"/>
    <data num_results="1">
                KMKT 292235Z AUTO 03013G19KT 10SM CLR 19/M02 A2998 RMK AO1
            <sky_condition sky_cover="CLR"/>


Import supporting modules
# System and OS
import sys, os

# Strings
import string

# HTML fetching
import requests
from urllib2 import urlopen

# Windows extensions (pywin32-220.win32-py2.7.exe)
import win32com.client
import pywintypes  # supports exceptions names

# time and date functionality
import time
import datetime

# In support of XML parsing
from xml.dom.minidom import parse 


def timeA_GT_global(timeA):
    # Check to see if A is later than what's in the global. 
    # and replace the global if it is...

    # Must declare locally to change a global. If no global declaration, the
    # ASSIGNMENT statement will force this variable to be local here in this
    # function.
    global time_datetime_maxDLSdate

    if (timeA > time_datetime_maxDLSdate):
        time_datetime_maxDLSdate = timeA

def nZS( string):
    # When writing to the spreadsheet, zero-length strings should be represented as None 
    if (string == ""):
        value = None
        value = string
    return value
def utc( timeTuple):
    timeString = "%d/%d/%d %d:%d:%d" % (timeTuple[1], timeTuple[2], timeTuple[0], timeTuple[3], timeTuple[4], timeTuple[5])
    return timeString

def processMultipleStations_xml(station_dictionary):

    # Build the URL string to get data from Multiple stations by making one query to the server.
    # (returned on one page).

    # Single station query for KMKT
    # https://aviationweather.gov/cgi-bin/data/dataserver.php?dataSource=metars&requestType=retrieve&format=xml&mostrecentforeachstation=constraint&hoursBeforeNow=2&stationString=KMKT
    # https://aviationweather.gov/cgi-bin/data/dataserver.php (current server)
    # https://aviationweather.gov/adds/dataserver_current/httpparam (old)
    url_base = ("https://aviationweather.gov/cgi-bin/data/dataserver.php?" + 
                    "dataSource=metars&" + 
                    "requestType=retrieve&" +
                    "format=xml&" +
                    "mostrecentforeachstation=constraint&" + 
                    "hoursBeforeNow=2&" +

    station_names = ""
    station_count = 0
    for station_name in station_dictionary.keys():
        station_count = station_count + 1
        station_names = station_names + station_name + ","  
    station_names = station_names[:-1] # remove the last comma 
    webpage_url = url_base + station_names

        # Fetch the XML page
        page = urlopen(webpage_url)
        message_str = "Error opening url ::: %s ==> %s" % (sys.exc_type, sys.exc_value) 
        enterInLog( message_str)
        print message_str
        # Stop here.
        return ''

        # Parse the XML
        dom_object = parse(page)
        message_str = "Error parsing XML ::: %s ==> %s" % (sys.exc_type, sys.exc_value) 
        enterInLog( message_str)
        print message_str
        # Stop here.
        return ''

    rowsForSpreadsheet = []

    # Check for error and warning messages in the XML. If found, write to log
    # and exit subroutine.

    message_error = getXMLvalueFirstElement(dom_object, "error")    
    message_warning = getXMLvalueFirstElement(dom_object, "warning")     

    if (message_error <> '') or (message_warning <> ''):
        # Send warnings to log file (and should also email a message).
        message_str = "Warning = %s \nError = %s" % (message_warning, message_error) 
        enterInLog( message_str) 
        print message_str

        # If there is an error (i.e., no data to work with), exit this function. 
        if (message_error <> ''): 
            return ''

    # Check an attribute in the XML to find the number of sites that return
    # data.
    n_data = int(dom_object.getElementsByTagName("data")[0].attributes["num_results"].value)

    # Write data for each station
    write_count = 0
    for data_index in range(n_data):  # range of 0 to n_data-1
            # Develop SQL statement
            sqlForStation = buildSQL_xml(dom_object, data_index) 

            # Run the SQL (send formal SQL string and info list). If a
            # successful write is indicated, bump up the counter.
            if (runSQL( sqlForStation[0], sqlForStation[1])):
                write_count = write_count + 1
                rowsForSpreadsheet.append( sqlForStation[2])
            # Note the backslash is a line-continuation character.
            message_str = "general error in Station loop ::: %s ==> %s, \nStation name = %s" \
                          % (sys.exc_type, sys.exc_value, 
                             getXMLvalueInStationGroup(dom_object, data_index, "station_id")) 
            print message_str

    # If there was a successful write to database, make entry in log showing
    # the number of successful gleans from the web site and the number of
    # successful writes to the database.
    # W = Writes
    # G = Gleans
    # P = Possible (number of stations collecting from)
    if (write_count > 0):
        message_str = "New data added: %sW %sG %sP" % (write_count, n_data, station_count) 
        enterInLog( message_str) 
        write_to_spreadsheet( rowsForSpreadsheet)
        print ""
        print "url = %s" % (webpage_url)

    print "%s successful writes from %s of %s stations returning data." % (write_count, n_data, station_count)
    #print "station names = ", station_names

def write_to_spreadsheet( data):
    sheet_url = "https://script.google.com/macros/s/AKfycbxEURXa6nwLdmivY6LNVqNvGs_ltg0E7OhncYDT45GZ8wn1v1SA9oruY6Iidy-pWBF08w/exec"
    postDict = {"sheetName":"aw", "weatherData":data}
        # Send with POST. Note: the postDict dictionary gets converted to a JSON string.
        jsonRequest = requests.post( sheet_url, json=postDict)
        message_str = "Error opening URL."
        print message_str + ", URL = " + sheet_url

def getXMLvalueFirstElement(dom_object, itemName):

    # This is the simplest approach. Just grab the first element value where
    # the element name matches the itemName target.

        value = dom_object.getElementsByTagName(itemName)[0].childNodes[0].data
        value = ''
    return value

def getXMLvalueInStationGroup(dom_object, data_index, itemName):

    # Use the data_index to access elements in the nth hunk of site data in the
    # dom object. (Catch the error if what we are looking for is NOT in the
    # XML tree.)

        # The following approach avoids the problem associated with the Wind
        # Gust data. Namely that it's not always present in each metar group.
        # So if you search directly for wind gust data using the
        # getElementsByTagName function and index, data associations will get
        # confused. This approach is much better in that you only search for
        # wind gust data from within an individual metar group.

        nth_metar_group = dom_object.getElementsByTagName("METAR")[data_index]
        value = nth_metar_group.getElementsByTagName(itemName)[0].childNodes[0].data
        value = ''
    return value

def cent_to_far(temp_c):
    if temp_c == '':
        t_far = '' 
        t_far = round(((9.0/5.0) * float(temp_c)) + 32.0, 1)
    return t_far 

def buildSQL_xml(dom_object, data_index):

    # Read the station name in the nth hunk of data.
    station_name = getXMLvalueInStationGroup(dom_object, data_index, "station_id")
    print "\nStation = ", station_name
    # Use the dictionary to look up the station number.
    station_number = station_dic[station_name]['ID']

    # Time (some necessary labor)

    # Get the ISO time string from the XML. This time stamp, in the XML, is not a local time,
    # but rather is in UTC time (formally known as Greenwich Mean time, GMT). In other words
    # this is not the local time of the weather station recording the data.
    time_ISO_UTC = getXMLvalueInStationGroup(dom_object, data_index, "observation_time")
    #print "time_ISO_UTC=", time_ISO_UTC
    # Parse the time string, which is in an ISO format, into a tuple. First, slice off the 
    # trailing "Z" in the time string. Note that the Z is for Zulu time, or also known as UTC time.
    # (year, month, day, hour, minute, second, weekday, yearday, daylightSavingAdjustment) 
    time_tuple_UTC = time.strptime(time_ISO_UTC[:-1], "%Y-%m-%dT%H:%M:%S")
    print "time_tuple_UTC =", time_tuple_UTC
    # The Unix epoch (or Unix time or POSIX time or Unix timestamp) is the number of seconds 
    # that have elapsed since January 1, 1970 (midnight UTC/GMT), not counting leap seconds 
    # (in ISO 8601: 1970-01-01T00:00:00Z).
    # Convert to seconds since the epoch. First it is important to know that 
    # mktime expects a local time, not a UTC time, and we have gotten a UTC 
    # time from the XML. So to correct for that, we must subtract off the 
    # timezones (in seconds) to correctly form the epoch time. Time.timezone 
    # is the offset of the local (non-DST) timezone, in seconds west of UTC. I 
    # know this seems ugly, but this is the simplest way to convert from UTC 
    # to epoch.
    # BUT, seems to me that this isn't a true EPOCH value. When I past the KMKT
    # value into an EPOCH to time converter it gives a value that is one hour behind
    # in the summer. Something is not right here...
    time_epoch = time.mktime(time_tuple_UTC) - time.timezone 
    #print "time_epoch =", time_epoch 

    # Side Note: This localtime function (using the time module) properly
    # determines if there is daylight savings time or not. You can use the
    # last value in the tuple to establish if DLS. As of March 2007, the times
    # in the database will be in local (to the weather station) standard time.

    time_tuple_gleaner = time.localtime(float(time_epoch)) 
    #print "time_tuple_gleaner =", time_tuple_gleaner

    # This ASCII version might be handy for printing. Not currently used
    # anywhere else.
    time_tuple_gleaner_ascii = time.asctime(time_tuple_gleaner)
    #print "time_tuple_gleaner_ascii =", time_tuple_gleaner_ascii

    # Create a datetime version of the time object. This will be in a timezone local
    # to the computer that is running this code (the gleaner). Apparently this always
    # returns a local (gleaner) standard time.
    time_datetime_gleaner_std = datetime.datetime.fromtimestamp(float(time_epoch)) 
    #print "time_datetime_gleaner_std =", time_datetime_gleaner_std
    # Check for daylight savings time and properly generate a DLS version of
    # the local time. That is, spring forward if during DLS. This is needed
    # for updating the daysGleaned table in the database. Otherwise data
    # gleaned during the first hour after midnight will not trigger a new day
    # in the daysGleaned table (because standard time doesn't consider this
    # data a new day). And the website won't let you get at the new data
    # because there is no corresponding date in the "days" control. Confusing;
    # I know. This little juggle must be done because the web site plots in DLS
    # and the data is stored in the database as standard time. 

    # One issue here is that you have to use the gleaner time zone when doing this.
    # Probably would be better if used the native time zone.
    # If the last part of this tuple is equal to 1, then it's daylightsavings time. 
    if (time_tuple_gleaner[-1] == 1):
        timeshift = datetime.timedelta(hours=1)
        time_datetime_gleaner_dls = time_datetime_gleaner_std + timeshift 
        time_datetime_gleaner_dls = time_datetime_gleaner_std
    #print "time_datetime_gleaner_dls =", time_datetime_gleaner_dls     
    # Shift the gleaner times to the native time zone (the local time zone where the sensor is).
    TZ_shift_net = station_dic[station_name]['TZS_MN'] - TZShift_gleanermoved
    time_datetime_native_std = time_datetime_gleaner_std + datetime.timedelta(hours=TZ_shift_net)
    time_datetime_native_dls = time_datetime_gleaner_dls + datetime.timedelta(hours=TZ_shift_net)
    #print "time_datetime_native_std =", time_datetime_native_std
    # Update the global variable that keeps the latest DLS datetime value (a
    # datetime object).
    # Here is a way to create a time tuple from a datetime object. TLS is just
    # a short name that is handy in the string-builder code below.
    tls = time_datetime_native_std.timetuple()
    #print "tls =", tls

    # Build date strings that will work with the database operation (insert):
    datetime_string = "%d/%d/%d %d:%d:%d" % (tls[1], tls[2], tls[0], tls[3], tls[4], 0) 
    MDY_string = "%d/%d/%d"               % (tls[1], tls[2], tls[0]                   ) 
    Hr_string = "%d"                      % (                        tls[3]           ) 
    Min_string = "%d"                     % (                                tls[4]   ) 
    # print "Datetime string = ", datetime_string

    # Get the rest of the data
    wind_dir_degrees = getXMLvalueInStationGroup(dom_object, data_index, "wind_dir_degrees")
    knots_to_mph = 1.15078030303 
    wind_speed_kt = getXMLvalueInStationGroup(dom_object, data_index, "wind_speed_kt")
    if (wind_speed_kt == ''):
        wind_speed_mph = ''
        wind_dir_degrees = ''
        wind_speed_mph = round(float(wind_speed_kt) * knots_to_mph, 0)
        if (wind_speed_mph == 0.0):
            wind_dir_degrees = ''

    if wind_dir_degrees <> '':
        if (float(wind_dir_degrees) < 0.0):
            wind_dir_degrees = ''
    wind_gust_kt = getXMLvalueInStationGroup(dom_object, data_index, "wind_gust_kt")
    if (wind_gust_kt == ''): 
        wind_gust_mph = wind_speed_mph 
        wind_gust_mph = round(float(wind_gust_kt) * knots_to_mph, 0)
    temp_c = getXMLvalueInStationGroup(dom_object, data_index, "temp_c")
    temp_f = cent_to_far(temp_c)
    dewpoint_c = getXMLvalueInStationGroup(dom_object, data_index, "dewpoint_c")
    dewpoint_f = cent_to_far(dewpoint_c) 
    altim_in_hg = getXMLvalueInStationGroup(dom_object, data_index, "altim_in_hg")

    # Construct the string: this block of code loops through the values
    # building up the SQL string as long as there are no empty string values.
    # This is one way to insert NULL values into the database (i.e. just
    # take them completely out of the SQL string)

    sql_data_raw = ( 
        str(time_epoch), datetime_string, time_ISO_UTC, MDY_string, Hr_string, Min_string,
        str(station_number), station_name, str(wind_dir_degrees), str(wind_speed_mph), str(wind_gust_mph),
        str(temp_f), str(dewpoint_f), str(altim_in_hg) )
    sql_names_raw = (
        'PerlTime', 'DateTimeStamp', 'LiteralDateTimeStamp', 'TimeMDY', 'TimeHr', 'TimeMin', 
        'StationNumber', 'StationName', 'WindDirection', 'WindSpeed', 'WindGust', 
        'TempAvg', 'DewPoint', 'Pressure') 
    #newSpreadSheetRow = [stationName, utc( timestamp_literal), nN( temp_f), nN( dewPoint_f), nN( windDirection_deg), nN( windSpeed_mph), nN( windGust_mph), nN( pressure_inHg)]
    spreadSheetRow = [station_name, utc( time_tuple_UTC), nZS( temp_f), nZS( dewpoint_f), nZS( wind_dir_degrees), nZS( wind_speed_mph), nZS( wind_gust_mph), nZS( altim_in_hg)]

    sql_names = "INSERT INTO FifteenMinData ("
    sql_format = "VALUES ("
    sql_data = () # Empty tuple
    index = 0
    for data_value in sql_data_raw:
        if data_value <> '':
            sql_names = sql_names + sql_names_raw[index] + ", "
            # Add a value to the tuple; note a single value tuple must be represented with a comma..
            sql_data = sql_data + (data_value,)
            # Note: best practice would be to not quote numeric values, quote only strings and dates.
            sql_format = sql_format + "'%s'," 
        index = index + 1
    # remove the last comma and append a ")" 
    sql_format = sql_format[:-1] + ")"
    sql_names = sql_names[:-2] + ") " 

    # Nice to see this at the console (when manually running).
    print sql_data
    # Apply the formatting operation. Note this sql_data MUST be a tuple for
    # this formatting approach to work.
    sql_data_formatted = sql_format % sql_data  

    sql_string = sql_names + sql_data_formatted
    print "SQL string = ", sql_string

    # Return SQL string and data list (to be used with error messages)
    return (sql_string, sql_data, spreadSheetRow)

def runSQL(sql_string, row_ascii):

    # row_ascii: Contains the variable names and is useful in constructing error messages.

    successful_execution = False 

    # Execute SQL string
        successful_execution = True 
        # For testing
        #print "SQL = ", sql_string
        #print "row_ascii = ", row_ascii

    except pywintypes.com_error:   # catch this specific error
        # If only a "duplicate data" error, don't write to the log file
        if (string.find(str(sys.exc_value), "duplicate data") <> -1):
            print "Data already in database (sql warning)."
            message_str = "SQL error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii) 
            print message_str

        message_str = "general error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii) 
        print message_str

    else:  # Run this block if no errors...   
        print "Successful SQL execution!"

    # # For testing, clean out any new records for this date.
    # sql = "DELETE * FROM [FifteenMinData] WHERE ([TimeMDY] = #6/29/2006#)"
    # database_conn.Execute(sql)

def enterInLog( logentry):
    logFile.write('==============' + str(datetime.datetime.today()) + '====V ' + version_number + '\n')
    logFile.write( logentry + '\n')

def openConnections():

    # Open logFile
    global logFile

    # The getcwd os function returns the current working directory. Another
    # similar command is os.path.abspath(""). This returns "C:\\waconia" on
    # Physics2 and "C:\\code\\python\\scraper" at here at home. Note that if
    # this is running as a system AT job, this will return something like
    # "C:\\winnt\\system32"

    logFileDir = os.getcwd()
    # If it's not at home then it must be running on Physics2 
    if (logFileDir <> "C:\\code\\python\\scraper"):
        logFileDir = "C:\\Users\\Jim\\Documents\\webcontent\\waconia"
    logFileName = "ws_log.txt"
    logFilePath = logFileDir + "\\" + logFileName 

    # Check for the file and create a new one if none is found.
    # Note: os.curdir doesn't work as a way to find the current directory when
    # if this task is scheduled. So I use the absolute paths above.
    if logFileName not in os.listdir(logFileDir):   
            logFile = open(logFilePath, 'w')  # create and write 
            sys.exit("Error when opening log file. Script stopped!!!")  # Shutdown if no way to log...

        # Header in file.
        logFile.write('==============VERSION '+ version_number +'========================' + '\n')
        logFile.write('File created: ' + str(datetime.datetime.today()) + '\n')
        logFile.write('=================================================' + '\n')
        logFile.write('=================================================' + '\n')
        logFile.write('=================================================' + '\n')
        logFile = open(logFilePath, 'a')  # append

    # Open Database connection
    global database_conn
    database_conn = win32com.client.Dispatch(r'ADODB.Connection')

    # Set path to the database. 
    DSN = 'PROVIDER=Microsoft.Jet.OLEDB.4.0;DATA SOURCE=C:/Users/Jim/Documents/webcontent/waconia/data/telem.mdb;'

        enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))
        # Close logFile on the way out.
        sys.exit("Error when opening database connection. Script stopped!!!")

def closeConnections():
    database_conn.Close()  # Notice the CAPITAL C
    logFile.close()        # Notice the lowercase c

def attemptWriteToDaysGleaned():

    # This serves to keep the DaysGleaned table up to date. This table is used
    # to populate the "Days" selection control on the web site.
    # If this gleaner is the first to cross into a new day (which it should since it looks at
    # central time zone sources), this should produce a successful write.

    # Make a local tuple out of the global time_datetime_maxDLSdate
    dT = time_datetime_maxDLSdate.timetuple()

    sql_string = "INSERT INTO DaysGleaned ([TimeMDY]) VALUES ('%s/%s/%s')" % (dT[1],dT[2],dT[0])
        # If not a "duplicate data" error, write to the log file.
        if (string.find(str(sys.exc_value), "duplicate data") == -1):  # can't find it in string
            enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))

Main program

# global variables (any variable in Main that get assigned)

# This global is used for triggering a new day in the database's daysgleaned table.
time_datetime_maxDLSdate = datetime.datetime(2001,1,1) # initialize to some old date

# This also gets written to the log file.
version_number = "1.94"
print "Version Number = ", version_number

# Prepare to write to database and log file.

# This global is used to shift the time-zone shifts if the gleaner is ever
# moved from MN.
TZShift_gleanermoved = 0

# The airport site name/number dictionary
station_dic = {
   # Sites near Saint Peter.                  
   'KMKT':{'ID':'101','TZS_MN':0},  # Mankato
   'KMML':{'ID':'102','TZS_MN':0},  # Marshall
   'KFRM':{'ID':'103','TZS_MN':0},  # Fairmont
   'KHCD':{'ID':'104','TZS_MN':0},  # Hutchinson
   'KULM':{'ID':'105','TZS_MN':0},  # New Ulm
   'KFBL':{'ID':'106','TZS_MN':0},  # Faribault
   'KAEL':{'ID':'107','TZS_MN':0},  # Albert Lea
   'KOWA':{'ID':'108','TZS_MN':0},  # Owatonna

   # Sites near Waconia.                                                       
   'KLVN':{'ID':'110','TZS_MN':0},  # Lakeville
   'KGYL':{'ID':'111','TZS_MN':0},  # Glencoe
   'KLJF':{'ID':'112','TZS_MN':0},  # Litchfield
   'KBDH':{'ID':'113','TZS_MN':0},  # Willmar
   'KPEX':{'ID':'114','TZS_MN':0},  # Paynesville

   # Sites near Worthington.
   'KOTG':{'ID':'120','TZS_MN':0},  # Worthington
   'KSPW':{'ID':'121','TZS_MN':0},  # Spencer, IA
   # Sites near Mille Lacs (Aitkin).
   'KAIT':{'ID':'130','TZS_MN':0},  # Aitkin
   'KJMR':{'ID':'131','TZS_MN':0},  # Mora
   # Sites in general.
   'KFGN':{'ID':'140','TZS_MN':0},  # Flag Island
   'KDYT':{'ID':'141','TZS_MN':0},  # Duluth
   'KLSE':{'ID':'142','TZS_MN':0},  # La Cross
   'KONA':{'ID':'143','TZS_MN':0},  # Winona
   'KRGK':{'ID':'144','TZS_MN':0},  # Red Wing
   'KCFE':{'ID':'145','TZS_MN':0},  # Buffalo Muni
   'KAXN':{'ID':'146','TZS_MN':0},  # Alexandria
   'KFFM':{'ID':'147','TZS_MN':0},  # Fergus Falls
   'KADC':{'ID':'148','TZS_MN':0},  # Wadena Muni
   'KGHW':{'ID':'149','TZS_MN':0},  # Glenwood
   'KBRD':{'ID':'150','TZS_MN':0},  # Brainerd
   'KLXL':{'ID':'151','TZS_MN':0},  # Little Falls
   # Sites near White Bear and Saint Croix River.
   'KRNH':{'ID':'160','TZS_MN':0},  # New Richmond
   'KANE':{'ID':'161','TZS_MN':0},  # Blaine
   'KSTP':{'ID':'162','TZS_MN':0},  # Saint Paul
   # Fritz's sites on the cape...
   'KCQX':{'ID':'163','TZS_MN':1},  # Chatham, MA
   # Alaska...
   'PABR':{'ID':'200','TZS_MN':-3}, # Barrow, AK
   # Hatteras
   'KHSE':{'ID':'205','TZS_MN': 1}, # Cape Hatteras, NC

   # Pasco,WA airport...
   'KPSC':{'ID':'165','TZS_MN':-2}, # Pasco, WA 

   # Antarctica
   'NZSP':{'ID':'380','TZS_MN':18},  # AMUNDSEN-SCOTT

   # Japan
   'RJTT':{'ID':'385','TZS_MN':15},  # TOKYO INTL AIRPO 
   # Near Bob Douglas in GA
   'KSSI':{'ID':'386','TZS_MN': 1}   # GA BRUNSWICK

# Fetch one page, then parse and write to database once for each site 

# Run cleanup query
nDaysBack = datetime.date.today() - datetime.timedelta(days=+366)
nDB = nDaysBack.timetuple()
time_string = str(nDB[0]) + "/" + str(nDB[1]) + "/" + str(nDB[2])
print "\nSQL initiated to remove old records from two tables."
runSQL("DELETE * FROM [FifteenMinData] WHERE ([TimeMDY] < #" + time_string + "#)", "")
runSQL("DELETE * FROM [DaysGleaned] WHERE ([TimeMDY] < #" + time_string + "#)", "")

# Close connections to database and log file.