Skip to main content
cap program drop classifyNeighbors 
program define classifyNeighbors

    args coordinatesFile /// 
         longitudeVar /// 
         latitudeVar /// 
         coordinatesKEEP_STRINGS /// 
         distanceFile /// 
         distanceUnit /// 
         distanceVar ///
         distanceKEEP_STRINGS /// 
         areaFile /// 
         areaUnit /// 
         areaVar /// 
         areaKEEP_STRINGS /// 
         neighbroingLinesFile /// 
         rootDir /// 
         PATH_GIS /// 
         geoid /// 
         idVar /// 
         maxNeighborDegree 
    
    ***********************IMPORTANT NOTE ABOUT FILES***************************
    * The files used for the coordinates, area, distance, and neighboring lines 
    * are allowed to be different files, although they do not have to be, but it 
    * is crucial that they all are related to some parent file such that the 
    * variable "objectid" which is assigned by ArcGIS is the same for all the 
    * spatial units. This means that you cannot use a file with the coordinates 
    * of the centroids of the spatial units that originated from a different 
    * shapefile, and clearly not from a file that does not even contain the 
    * "objectid" variable. 
    ****************************************************************************
    * Arguments Legend 
    * coordinatesFile: file which stores the coordinates of each centroid per 
    * each spatial unit. 
    * longitudeVar: variable with the longitude coordinate 
    * latitudeVar: variable with the latitude coordinate
    * distanceFile: The file which stores the point distance, centroid to  
    * centroid, between your spatial units. 
    * distanceUnit: meters, km, feet, miles, etc.
    * distanceVar: variable name for the distance variable
    * areaFile: The file stores the total area of your spatial unit. Could be 
    * the same file as the distanceFile, but ideally you want to use different 
    * projections (distance preserving, area preserving, etc.) to reduce the 
    * error in each variable. 
    * areaUnit: squared km, squared feet, etc. 
    * areaVar: variable name for the area variable 
    * neighbroingLinesFile: This file stores the data from the XXXXXX tool, from 
    * ArcGIS. 
    * rootDir: path to your library.
    * geoid: variable that identifies the spatial unit in ArcGIS 
    * idVar: variable that identifies the spatial unit from hereafter
    * maxNeighborDegree: how many neighboring degrees should be classified. By 
    * choosing 1 the program will classify only the first degree neighbors, by 
    * choosing 2 it will also classify the neighbor of the neighbor (2nd degree 
    * neighbor), by choosing 3 it will classify the neighbor of the neighbor of 
    * the neighbor (3rd degree neighbor), and so on. 
    * CAUTION: Choosing higher maximum degrees results in very long runtimes. 
    * This program is not very efficient and could take a lot of time (and 
    * require a lot of RAM) to process high degree of neighboring relationship. 

    tempfile coordinatesTable /// 
             distanceTable ///
             areaTable /// 
             neighboringCounties



    * Get coordinates for each county centroid 
    import delimited "`rootDir'/`PATH_GIS'/`coordinatesFile'", /// 
           clear stringcols(`coordinatesKEEP_STRINGS') 

    rename `geoid' `idVar'
    rename objectid gisID
    keep gisID `idVar' `longitudeVar' `latitudeVar'
    rename `longitudeVar' longitude
    rename `latitudeVar' latitude 

    sort gisID  
    save `coordinatesTable', replace


    * Import the distance matrix between county centroids 
    * Use the stringcols option to preserve leading zeros in your id variable
    import delimited "`rootDir'/`PATH_GIS'/`distanceFile'", /// 
           clear stringcols(`distanceKEEP_STRINGS')

    rename `geoid' `idVar'Prime 
    rename `geoid'_1 `idVar'
    * This would be the place to convert, if needed, the distance units.
    label var `distanceVar' "Distance (in `distanceUnit')"

    keep `idVar'Prime `idVar' `distanceVar' 
    sort `idVar'Prime `idVar' 
    save `distanceTable', replace 


    * Import the area of each spatial unit 
    import delimited "`rootDir'/`PATH_GIS'/`areaFile'", /// 
           clear stringcols(`areaKEEP_STRINGS')

    rename `geoid' `idVar'  

    keep `idVar' `areaVar'
    label var `areaVar' "Area (in `areaUnit')"

    sort `idVar' 
    save `areaTable', replace 

    * Now that we have the distance between centroids, and the area of each 
    * spatial unit stored in memory we can continue to classify the neighboring 
    * relationship between the spatial units. We need to prepare one more file. 

    * Import the information about the neighboring lines, the lines that identify 
    * the spatial units that share a border. 
    import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear 

    keep objectid left_fid right_fid 
    * Drop the lines that are on the edge, meaning that there is no further spatial 
    * unit beyond that line. 
    drop if left_fid < 0 | right_fid < 0
    * No point in having duplicate line sets that identify the same neighboring 
    * spatial unit so we keep only the unique combinations.
    bysort left_fid right_fid: keep if _n == 1
    rename left_fid gisID 
    rename right_fid neighborID
    * We now have the the neighbors to the right of each spatial unit
    sort gisID 
    save `neighboringCounties', replace 

    * Repeating to find the neighbors on the left of each spatial unit 
    import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear 

    keep objectid left_fid right_fid 
    drop if left_fid < 0 | right_fid < 0
    bysort left_fid right_fid: keep if _n == 1
    rename right_fid gisID 
    rename left_fid neighborID 
    * We now have the neighbors to the left of each spatial unit 
    sort gisID 

    * Combine the neighbors to the left and to the right of each spatial unit by 
    * appending the previous file saved. 
    append using `neighboringCounties'
    keep gisID neighborID

    save `neighboringCounties', replace 


    * We now have all the needed files ready and we can start the classification.
    use `coordinatesTable', clear 

    sort `idVar'
    merge 1:1 `idVar' using `areaTable'
    keep if _merge == 3 
    drop _merge 

    rename `idVar' `idVar'Prime 
    rename longitude longitudePrime
    rename latitude latitudePrime 
    rename `areaVar' areaPrime 

    * Each observation is a spatial unit that serves as an anchor, denoted as PRIME. 
    * Now we start identifying the neighboring units of each PRIME unit.   

    * Match spatial units based on their GIS objectid numbers 
    merge 1:m gisID using `neighboringCounties'
    keep if _merge == 3 
    drop _merge 
    * This step just gave us the matching between each unique prime spatial unit 
    * and all its first degree neighbors. Next we want to add the data about the 
    * coordinates of the first degree neighbors and the distance between the prime 
    * units and the first degree neighbors. 

    * Flag the neighborID as the GIS objectid number (gisID)
    drop gisID
    rename neighborID gisID 
    sort gisID 
    * Match the gisID with the idVar and geolocation data 
    merge m:1 gisID using `coordinatesTable'
    keep if _merge == 3
    drop _merge  
    * Get the distance data 
    sort `idVar'Prime `idVar'
    merge 1:1 `idVar'Prime `idVar' using `distanceTable'
    keep if _merge == 3
    drop _merge
    * Get the area data 
    sort `idVar'
    merge m:1 `idVar' using `areaTable'
    keep if _merge == 3
    drop _merge 

    * Rename 
    rename `idVar' `idVar'Neighbor1
    rename longitude longitudeNeighbor1 
    rename latitude latitudeNeighbor1 
    rename `distanceVar' distanceNeighbor1 
    rename `areaVar' areaNeighbor1 

    * Next we are going to repeat the above process for second degree neighbors and 
    * so on, up to the neighboring degree determined by the maxNeighborDegree 
    * parameter. 

    * This will simply be used to order the variables nicely at the end.
    local sortOrder = ""
    forvalues degreeN = 2(1)`maxNeighborDegree' {
        sort gisID 
        joinby gisID using `neighboringCounties'

        drop gisID
        rename neighborID gisID 
        sort gisID 

        merge m:1 gisID using `coordinatesTable'
        keep if _merge == 3
        drop _merge  

        sort `idVar'Prime `idVar'
        merge m:1 `idVar'Prime `idVar' using `distanceTable'
        drop if _merge == 2
        drop _merge

        sort `idVar' 
        merge m:1 `idVar' using `areaTable'
        keep if _merge == 3 
        drop _merge 

        rename `idVar' `idVar'Neighbor`degreeN'
        rename longitude longitudeNeighbor`degreeN'
        rename latitude latitudeNeighbor`degreeN'
        rename `distanceVar' distanceNeighbor`degreeN'
        rename `areaVar' areaNeighbor`degreeN' 

        local sortOrder = "`sortOrder' " + "`idVar'Neighbor`degreeN'"
    }

    sort `idVar'Prime `idVar'Neighbor1 `sortOrder'

    * Removing duplicates. 
    * This process takes the most amount of time relative to the other pieces and 
    * this is also the part that does not scale well, meaning that computational 
    * time increases exponentially with the number of max neighbor degree. 

    * Counties should not be neighbors of themselves 
    forvalues degreeN = 2(1)`maxNeighborDegree' {
        drop if `idVar'Prime == `idVar'Neighbor`degreeN'
    }

    * Neighboring counties should only be counted once 
    local maxNeighborDegreeMinus1 = `maxNeighborDegree' -1
    qui levelsof `idVar'Prime, local(`idVar'Levels)
    * The following two locals will help generate a progress bar 
    local idN: word count "``idVar'Levels'"
    local k = 1 
    foreach idPrime in ``idVar'Levels' {
      forvalues degreeN = 1(1)`maxNeighborDegreeMinus1' {
        qui levelsof `idVar'Neighbor`degreeN' if `idVar'Prime == "`idPrime'", local(`idVar'N`degreeN')
        local degreeN_Plus_1 = `degreeN' + 1
        foreach idCodeN`degreeN' in ``idVar'N`degreeN'' {
          qui drop if `idVar'Neighbor`degreeN_Plus_1' == "`idCodeN`degreeN''" /// 
                      & `idVar'Prime == "`idPrime'"    
        }    
      }
      * I included a progress so there will be some meaningful output while this runs 
      local PROGRESS = "Progress (Neighbor): " + string(round(`k'/`idN'*100,0.01)) + "%"
      di "`PROGRESS'"
      local k = `k' + 1
    }

    drop gisID

    * Each spatial unit, identified by `idVar'Prime, has its neighbors of degree 1, 
    * identified by `idVar'Neighbor1, and its neighbors of degree 2, identified by 
    * `idVar'Neighbor2; and so on. 
    * 
    * Notice that the number of observations per-spatial unit is determined by the 
    * number of unique spatial units in the the highest neighboring degree, meaning 
    * there are some duplicates left in the lower levels. 


    * This part makes it easier to merge the resulting file based on `idVar'
    gen `idVar' = `idVar'Prime
    sort `idVar'
    save `neighboringCounties', replace 

end