![]() |
Number of tweets in different languages posted around Germany |
Collecting the geocoded tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package testing; | |
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.text.SimpleDateFormat; | |
import java.util.Date; | |
import twitter4j.FilterQuery; | |
import twitter4j.RawStreamListener; | |
import twitter4j.StallWarning; | |
import twitter4j.Status; | |
import twitter4j.StatusDeletionNotice; | |
import twitter4j.StatusListener; | |
import twitter4j.TwitterException; | |
import twitter4j.TwitterStream; | |
import twitter4j.TwitterStreamFactory; | |
public class RecordGermanTweets { | |
static final SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH_mm_ss"); | |
static final long deltaT = 1000 * 60 * 60; // Time in msec, when a new file should be craeted | |
public static void main(String[] args) throws TwitterException, IOException, InterruptedException { | |
long lastWritten = 0; | |
StatusWriter listener = null; | |
while (true) { | |
long start = System.currentTimeMillis(); | |
if (start - lastWritten > deltaT) { | |
String fileName = df.format(new Date()) + ".json"; | |
File file = new File(fileName); | |
System.err.println("Writing to " + file.getAbsolutePath()); | |
if (listener == null) { // First File | |
listener = new StatusWriter(file); | |
TwitterStreamFactory twsf = new TwitterStreamFactory(SimpleQuery.buildConfig().build()); | |
TwitterStream twitterStream = twsf.getInstance(); | |
twitterStream.addListener((StatusListener) listener); | |
twitterStream.addListener((RawStreamListener) listener); | |
FilterQuery fq = new FilterQuery(); | |
double[][] locations = { { 5, 45 }, { 18, 56 } }; // Region around germany | |
fq.locations(locations); | |
System.err.println("Listening to " + fq); | |
twitterStream.filter(fq); | |
} else { | |
System.err.println("Writing to new file [" + file + "]"); | |
listener.setFileName(file); | |
} | |
lastWritten = System.currentTimeMillis(); | |
Thread.sleep(10000); | |
} | |
} | |
} | |
static class StatusWriter implements StatusListener, RawStreamListener { | |
private File file; | |
public StatusWriter(File file) { | |
this.file = file; | |
} | |
public void setFileName(File file) { | |
this.file = file; | |
} | |
@Override | |
public void onMessage(String rawString) { | |
try { | |
BufferedWriter w = new BufferedWriter(new FileWriter(file, true)); | |
w.write(rawString + "\n"); | |
w.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
public void onStatus(Status status) { | |
} | |
@Override | |
public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) { | |
} | |
@Override | |
public void onTrackLimitationNotice(int numberOfLimitedStatuses) { | |
System.out.println("Got track limitation notice:" + numberOfLimitedStatuses); | |
} | |
@Override | |
public void onScrubGeo(long userId, long upToStatusId) { | |
System.out.println("Got scrub_geo event userId:" + userId + " upToStatusId:" + upToStatusId); | |
} | |
@Override | |
public void onStallWarning(StallWarning warning) { | |
System.out.println("Got stall warning:" + warning); | |
} | |
@Override | |
public void onException(Exception ex) { | |
ex.printStackTrace(); | |
} | |
} | |
} |
{"created_at":"Tue May 21 17:51:09 +0000 2013","id":336901993555709952,"id_str":"336901993555709952","text":"@OmegaBlue69 ... {"created_at":"Tue May 21 17:51:10 +0000 2013","id":336901996680450048,"id_str":"336901996680450048","text":"Sweet1 ....
Handling the json-file
The first task extracts the relevant information from these files. The following script reads the json files line by line and, using rjson, writes the coordinates, language and text of each tweet to a text file, e.g. "2013-05-21T19_51_03.coords.txt".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rjson) | |
library(maps) | |
# Directory holding the recorded *.json files. It has to end with "/" because
# the file names are appended to it with paste0().
fromDir <- "E:/Twitter/"
# The *.coords.txt results are written relative to this working directory.
setwd("c:/Daten/Twitter/tmp/")
# Reads a file that holds one tweet (a JSON object) per line.
#
# inFile: path of the json file to read.
# Returns a data.frame with the columns "long", "lat", "lang" (factor) and
# "text", one row per distinct tweet that carries a "coordinates" entry; the
# row names are the tweet ids (id_str). Lines that cannot be parsed and tweets
# without coordinates are reported/skipped and do not produce a row.
readCoords <- function (inFile) {
  #inFile <- "2013-05-21T17_56_16.json" #Uncomment in final run
  print(paste0("reading ", inFile))
  lines <- readLines(inFile)
  rows <- length(lines)
  coord <- matrix(0, nrow = rows, ncol = 2)
  lang <- character(rows)
  txt <- character(rows)
  ids <- character(rows)
  # seq_len() instead of 1:rows, which would iterate over c(1, 0) for an empty file
  for (r in seq_len(rows)) {
    # On a parse error res becomes NULL; previously it silently kept the tweet
    # of the preceding line.
    res <- tryCatch(fromJSON(lines[r]), error = function(e) { print(e); NULL })
    tryCatch(
      # Test the field that is actually read below (was: res$geo)
      if (!is.null(res$coordinates)) {
        g <- res$coordinates
        if (g$type == "Point") {
          # GeoJSON order: longitude first, then latitude
          lon <- g$coordinates[1]
          lat <- g$coordinates[2]
          if (is.numeric(lon) && is.numeric(lat)) {
            coord[r, ] <- c(lon, lat)
          }
        }
        lang[r] <- res$lang
        txt[r] <- res$text
        ids[r] <- res$id_str
      }
      , error = function(e) print(paste0("Reading ", e)))
  }
  print(paste0("Number of Tweets ", dim(coord)[1]))
  # Keep the first occurrence of every id and drop the rows that were never
  # filled (id ""), which used to end up as one bogus tweet at (0, 0).
  keep <- !duplicated(ids) & ids != ""
  # drop = FALSE keeps coord a two-column matrix even if only one row is left
  df <- data.frame(coord[keep, , drop = FALSE], as.factor(lang[keep]), txt[keep],
                   row.names = ids[keep], stringsAsFactors = FALSE)
  colnames(df) <- c("long", "lat", "lang", "text")
  return (df)
}
# Converts every "<name>.json" file in fromDir into "<name>.coords.txt" in the
# working directory. Inputs whose output file already exists are skipped, so
# the script can be re-run after new files have been recorded.
#Testing with small file
#write.table(x=readCoords("2013-05-21T17_56_16.json"), file="2013-05-21T17_56_16.coords.txt")
#files <- list.files(path = ".", pattern = "\\.json$")
# The pattern is a regular expression: escape the dot and anchor it at the end,
# otherwise names such as "x.json.bak" or "x.json.coords.txt" match as well.
files <- list.files(path = fromDir, pattern = "\\.json$")
for (jsonFile in files) {
  # Strip only the trailing ".json" extension (was an unanchored '.json' regex)
  fileOut <- sub("\\.json$", "", jsonFile)
  fileOut <- paste0(fileOut, ".coords.txt")
  if (file.exists(fileOut)) {
    print(paste0("Skipping ... ", fileOut))
    next
  }
  longName <- paste0(fromDir, jsonFile)
  print(paste0("Reading file ", longName))
  cc <- readCoords(longName)
  write.table(x = cc, file = fileOut)
}
Putting it all together
The next script picks up all text files with coordinate information, merges infrequent levels and does the color-coding. Finally, it creates a simple barplot and stores the data in a data.frame all.data and the colors in a vector cols.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rjson) | |
library(maps) | |
# Directory holding the recorded *.json files. It has to end with "/" because
# the file names are appended to it with paste0().
fromDir <- "E:/Twitter/"
# The *.coords.txt results are written relative to this working directory.
setwd("c:/Daten/Twitter/tmp/")
# Reads a file that holds one tweet (a JSON object) per line.
#
# inFile: path of the json file to read.
# Returns a data.frame with the columns "long", "lat", "lang" (factor) and
# "text", one row per distinct tweet that carries a "coordinates" entry; the
# row names are the tweet ids (id_str). Lines that cannot be parsed and tweets
# without coordinates are reported/skipped and do not produce a row.
readCoords <- function (inFile) {
  #inFile <- "2013-05-21T17_56_16.json" #Uncomment in final run
  print(paste0("reading ", inFile))
  lines <- readLines(inFile)
  rows <- length(lines)
  coord <- matrix(0, nrow = rows, ncol = 2)
  lang <- character(rows)
  txt <- character(rows)
  ids <- character(rows)
  # seq_len() instead of 1:rows, which would iterate over c(1, 0) for an empty file
  for (r in seq_len(rows)) {
    # On a parse error res becomes NULL; previously it silently kept the tweet
    # of the preceding line.
    res <- tryCatch(fromJSON(lines[r]), error = function(e) { print(e); NULL })
    tryCatch(
      # Test the field that is actually read below (was: res$geo)
      if (!is.null(res$coordinates)) {
        g <- res$coordinates
        if (g$type == "Point") {
          # GeoJSON order: longitude first, then latitude
          lon <- g$coordinates[1]
          lat <- g$coordinates[2]
          if (is.numeric(lon) && is.numeric(lat)) {
            coord[r, ] <- c(lon, lat)
          }
        }
        lang[r] <- res$lang
        txt[r] <- res$text
        ids[r] <- res$id_str
      }
      , error = function(e) print(paste0("Reading ", e)))
  }
  print(paste0("Number of Tweets ", dim(coord)[1]))
  # Keep the first occurrence of every id and drop the rows that were never
  # filled (id ""), which used to end up as one bogus tweet at (0, 0).
  keep <- !duplicated(ids) & ids != ""
  # drop = FALSE keeps coord a two-column matrix even if only one row is left
  df <- data.frame(coord[keep, , drop = FALSE], as.factor(lang[keep]), txt[keep],
                   row.names = ids[keep], stringsAsFactors = FALSE)
  colnames(df) <- c("long", "lat", "lang", "text")
  return (df)
}
# Converts every "<name>.json" file in fromDir into "<name>.coords.txt" in the
# working directory. Inputs whose output file already exists are skipped, so
# the script can be re-run after new files have been recorded.
#Testing with small file
#write.table(x=readCoords("2013-05-21T17_56_16.json"), file="2013-05-21T17_56_16.coords.txt")
#files <- list.files(path = ".", pattern = "\\.json$")
# The pattern is a regular expression: escape the dot and anchor it at the end,
# otherwise names such as "x.json.bak" or "x.json.coords.txt" match as well.
files <- list.files(path = fromDir, pattern = "\\.json$")
for (jsonFile in files) {
  # Strip only the trailing ".json" extension (was an unanchored '.json' regex)
  fileOut <- sub("\\.json$", "", jsonFile)
  fileOut <- paste0(fileOut, ".coords.txt")
  if (file.exists(fileOut)) {
    print(paste0("Skipping ... ", fileOut))
    next
  }
  longName <- paste0(fromDir, jsonFile)
  print(paste0("Reading file ", longName))
  cc <- readCoords(longName)
  write.table(x = cc, file = fileOut)
}