require(lattice) require(XML) require(quantreg) # Single Item rateSample = read.csv("rateSample.csv", header = TRUE, row.names = "ID") # plot(1:400, apply(rateSample[, 2:6], 1, sum)) fit = lm(Y ~ . - 1, data = rateSample) # couldn't reveal the truth summary(fit) parallel(~ rateSample[, 6:2], alpha = 0.1, # alpha-blending aspect = "xy", # 45-degree banking horizontal.axis = FALSE) # Douban Movie 250 List # See http://en.wikipedia.org/wiki/Internet_Movie_Database # C [6.0, 8.5], step = 0.01, 251 nodes. # m [1300, 25000], step = 10, 2371 nodes. url = "http://movie.douban.com/top250?format=text" topList = htmlTreeParse(url, useInternal = TRUE) rateScore = as.numeric(iconv(sapply(getNodeSet(doc = topList, path = "//td[@headers='m_rating_score']"), xmlValue), from = 'UTF-8', to = '')) rateNum = as.numeric(iconv(sapply(getNodeSet(doc = topList, path = "//td[@headers='m_rating_num']"), xmlValue), from = 'UTF-8', to = '')) RV = rateScore * rateNum cGrid = rep(seq(6.0, 8.5, 0.01), 2371) mGrid = rep(seq(1300, 25000, 10), 251) resMat = matrix(NA, nrow = 251 * 2371, ncol = 250) for (i in 1:nrow(resMat)) { resMat[i, ] = sort( (RV + (cGrid[i] * mGrid[i]))/(rateNum + mGrid[i]), index.return = TRUE, decreasing = TRUE)$ix } # L1-norm resVec = rep(NA, nrow(resMat)) for (i in 1:length(resVec)) { resVec[i] = as.numeric( dist(rbind(1:250, resMat[i, ]), method = "manhattan")) } which(resVec == min(resVec)) # GMT+8 2011/12/24 18:00 # [1] 28617 28618 161404 161406 161407 315515 315516 469626 469627 cGrid[which(resVec == min(resVec))] # [1] 6.02 6.03 6.10 6.12 6.13 6.07 6.08 6.04 6.05 mGrid[which(resVec == min(resVec))] # [1] 2940 2950 3050 3070 3080 3010 3020 2970 2980 # L2-norm resVec = rep(NA, nrow(resMat)) for (i in 1:length(resVec)) { resVec[i] = as.numeric( dist(rbind(1:250, resMat[i, ]), method = "euclidean")) } which(resVec == min(resVec)) # [1] 315515 cGrid[which(resVec == min(resVec))] # [1] 6.07 mGrid[which(resVec == min(resVec))] # [1] 3010 # Quantile Regression for Rank Shift x = 1:250 y = abs(resMat[315515, ] - 1:250) plot(x, y, cex = 0.25, type = "n", ylim = c(0, 25), xlab = "Original Rank", ylab = "Absolute Rank Shift") points(x, y, cex = 0.5, col = "darkgrey") abline(rq(y ~ x, tau = 0.5), col = "black") # Median Est. abline(lm(y ~ x), lty = 2, col = "red") # Least Square Est. taus = c(0.1, 0.25, 0.75, 0.90) for(i in 1:length(taus)) { abline(rq(y ~ x, tau = taus[i]), col = "darkgrey") }