[{"data":1,"prerenderedAt":3478},["ShallowReactive",2],{"graph_subreddit":3},{"id":4,"title":5,"body":6,"comments":171,"date":3461,"description":3462,"draft":3463,"extension":3464,"external":3465,"image":3466,"meta":3467,"navigation":171,"path":3469,"seo":3470,"stem":3471,"tags":3472,"__hash__":3477},"blog/2017/03/03/graph_subreddit.md","Related subreddit graph exploration with NetworkX",{"type":7,"value":8,"toc":3459},"minimark",[9,14,23,26,38,109,346,352,604,607,610,618,623,631,634,640,648,919,922,938,941,1754,1757,1793,1800,1803,1809,1812,1842,1845,2040,2047,2051,2060,2084,2106,2139,2161,2200,2226,2232,2235,2255,2261,2265,2268,2286,2292,2295,2442,2449,2458,2464,2472,2478,2486,2492,2500,2506,2514,2520,2528,2534,2542,2548,2556,2562,2570,2576,2584,2590,2598,2604,2612,2618,2626,2632,2640,2646,2661,2667,2679,2686,2836,2839,2844,2848,2856,2861,2868,2883,2886,2921,2927,2962,2967,2970,3031,3090,3096,3099,3178,3183,3187,3190,3231,3237,3261,3267,3311,3317,3320,3374,3380,3383,3412,3418,3427,3455],[10,11,13],"h1",{"id":12},"graphing-subreddits","Graphing Subreddits",[15,16,17,18,22],"p",{},"This notebook explores some basic concepts of graph theory. A few weeks ago I set up a script to scrape data from ",[19,20,21],"a",{"href":21},"reddit.com"," with the goal of visualizing the network of related subreddits (forums on specific topics) and related data.",[15,24,25],{},"Reddit is home to over 600,000 communities, known as subreddits, where people come to share information, opinions, links, etc. and discuss things in an open forum. Most subreddits display links to related subreddits. For example, /r/apple (the Apple subreddit) links to /r/iPhone, a subreddit all about the iPhone, and over a dozen other Apple-related subreddits.",[15,27,28,29,33,34,37],{},"If you visit reddit.com as a guest, you will see a list of popular subreddits. This list is located inside an ",[30,31,32],"code",{},"html"," tag called ",[30,35,36],{},"drop-choices",". 
Here it is:",[39,40,45],"pre",{"className":41,"code":42,"language":43,"meta":44,"style":44},"language-python shiki shiki-themes github-light github-dark monokai","from selenium import webdriver\nimport re\nimport time\nimport numpy as np\nfrom bs4 import BeautifulSoup\n","python","",[30,46,47,66,74,82,96],{"__ignoreMap":44},[48,49,52,56,60,63],"span",{"class":50,"line":51},"line",1,[48,53,55],{"class":54},"sC2Qs","from",[48,57,59],{"class":58},"sMOD_"," selenium ",[48,61,62],{"class":54},"import",[48,64,65],{"class":58}," webdriver\n",[48,67,69,71],{"class":50,"line":68},2,[48,70,62],{"class":54},[48,72,73],{"class":58}," re\n",[48,75,77,79],{"class":50,"line":76},3,[48,78,62],{"class":54},[48,80,81],{"class":58}," time\n",[48,83,85,87,90,93],{"class":50,"line":84},4,[48,86,62],{"class":54},[48,88,89],{"class":58}," numpy ",[48,91,92],{"class":54},"as",[48,94,95],{"class":58}," np\n",[48,97,99,101,104,106],{"class":50,"line":98},5,[48,100,55],{"class":54},[48,102,103],{"class":58}," bs4 ",[48,105,62],{"class":54},[48,107,108],{"class":58}," BeautifulSoup\n",[39,110,112],{"className":41,"code":111,"language":43,"meta":44,"style":44},"driver = webdriver.PhantomJS()\ndriver.get('https://www.reddit.com/')\ntime.sleep(4 + np.random.random())\nhtml = driver.page_source.encode('utf-8')\n\ns = BeautifulSoup(html)\ndefaults = s.find('div', attrs={'class':'drop-choices'})\nsubs = re.compile(r\"\\/r\\/[\\w.]+\\/?\")\ndefault_subreddits = list(set(subs.findall(str(defaults))))\n\nfor x in default_subreddits: print '[' + x + '](https://reddit.com'+ x + '), ',\n",[30,113,114,125,137,152,167,173,184,222,271,298,303],{"__ignoreMap":44},[48,115,116,119,122],{"class":50,"line":51},[48,117,118],{"class":58},"driver ",[48,120,121],{"class":54},"=",[48,123,124],{"class":58}," 
webdriver.PhantomJS()\n",[48,126,127,130,134],{"class":50,"line":68},[48,128,129],{"class":58},"driver.get(",[48,131,133],{"class":132},"sstjo","'https://www.reddit.com/'",[48,135,136],{"class":58},")\n",[48,138,139,142,146,149],{"class":50,"line":76},[48,140,141],{"class":58},"time.sleep(",[48,143,145],{"class":144},"s7F3e","4",[48,147,148],{"class":54}," +",[48,150,151],{"class":58}," np.random.random())\n",[48,153,154,157,159,162,165],{"class":50,"line":84},[48,155,156],{"class":58},"html ",[48,158,121],{"class":54},[48,160,161],{"class":58}," driver.page_source.encode(",[48,163,164],{"class":132},"'utf-8'",[48,166,136],{"class":58},[48,168,169],{"class":50,"line":98},[48,170,172],{"emptyLinePlaceholder":171},true,"\n",[48,174,176,179,181],{"class":50,"line":175},6,[48,177,178],{"class":58},"s ",[48,180,121],{"class":54},[48,182,183],{"class":58}," BeautifulSoup(html)\n",[48,185,187,190,192,195,198,201,205,207,210,213,216,219],{"class":50,"line":186},7,[48,188,189],{"class":58},"defaults ",[48,191,121],{"class":54},[48,193,194],{"class":58}," s.find(",[48,196,197],{"class":132},"'div'",[48,199,200],{"class":58},", ",[48,202,204],{"class":203},"sTHNf","attrs",[48,206,121],{"class":54},[48,208,209],{"class":58},"{",[48,211,212],{"class":132},"'class'",[48,214,215],{"class":58},":",[48,217,218],{"class":132},"'drop-choices'",[48,220,221],{"class":58},"})\n",[48,223,225,228,230,233,237,240,244,247,249,252,256,259,262,264,267,269],{"class":50,"line":224},8,[48,226,227],{"class":58},"subs ",[48,229,121],{"class":54},[48,231,232],{"class":58}," 
re.compile(",[48,234,236],{"class":235},"sq6CD","r",[48,238,239],{"class":132},"\"",[48,241,243],{"class":242},"sHuvb","\\/",[48,245,236],{"class":246},"sFxd3",[48,248,243],{"class":242},[48,250,251],{"class":144},"[",[48,253,255],{"class":254},"sLkwE","\\w",[48,257,258],{"class":144},".]",[48,260,261],{"class":54},"+",[48,263,243],{"class":242},[48,265,266],{"class":54},"?",[48,268,239],{"class":132},[48,270,136],{"class":58},[48,272,274,277,279,283,286,289,292,295],{"class":50,"line":273},9,[48,275,276],{"class":58},"default_subreddits ",[48,278,121],{"class":54},[48,280,282],{"class":281},"s-m8C"," list",[48,284,285],{"class":58},"(",[48,287,288],{"class":281},"set",[48,290,291],{"class":58},"(subs.findall(",[48,293,294],{"class":281},"str",[48,296,297],{"class":58},"(defaults))))\n",[48,299,301],{"class":50,"line":300},10,[48,302,172],{"emptyLinePlaceholder":171},[48,304,306,309,312,315,318,322,325,327,329,331,334,336,338,340,343],{"class":50,"line":305},11,[48,307,308],{"class":54},"for",[48,310,311],{"class":58}," x ",[48,313,314],{"class":54},"in",[48,316,317],{"class":58}," default_subreddits: ",[48,319,321],{"class":320},"sTrkL","print",[48,323,324],{"class":132}," '['",[48,326,148],{"class":54},[48,328,311],{"class":58},[48,330,261],{"class":54},[48,332,333],{"class":132}," '](https://reddit.com'",[48,335,261],{"class":54},[48,337,311],{"class":58},[48,339,261],{"class":54},[48,341,342],{"class":132}," '), '",[48,344,345],{"class":58},",\n",[15,347,348,349,215],{},"Here are the elements of 
",[30,350,351],{},"default_subreddits",[353,354,355],"blockquote",{},[15,356,357,200,363,200,368,200,373,200,378,200,383,200,388,200,393,200,398,200,403,200,408,200,413,200,418,200,423,200,428,200,433,200,438,200,443,200,448,200,453,200,458,200,463,200,468,200,473,200,478,200,483,200,488,200,493,200,498,200,503,200,508,200,513,200,518,200,523,200,528,200,533,200,538,200,543,200,548,200,553,200,558,200,563,200,568,200,573,200,578,200,583,200,588,200,593,200,598,603],{},[19,358,362],{"href":359,"rel":360},"https://reddit.com/r/LifeProTips/",[361],"nofollow","/r/LifeProTips/",[19,364,367],{"href":365,"rel":366},"https://reddit.com/r/Futurology/",[361],"/r/Futurology/",[19,369,372],{"href":370,"rel":371},"https://reddit.com/r/OldSchoolCool/",[361],"/r/OldSchoolCool/",[19,374,377],{"href":375,"rel":376},"https://reddit.com/r/mildlyinteresting/",[361],"/r/mildlyinteresting/",[19,379,382],{"href":380,"rel":381},"https://reddit.com/r/askscience/",[361],"/r/askscience/",[19,384,387],{"href":385,"rel":386},"https://reddit.com/r/UpliftingNews/",[361],"/r/UpliftingNews/",[19,389,392],{"href":390,"rel":391},"https://reddit.com/r/aww/",[361],"/r/aww/",[19,394,397],{"href":395,"rel":396},"https://reddit.com/r/GetMotivated/",[361],"/r/GetMotivated/",[19,399,402],{"href":400,"rel":401},"https://reddit.com/r/personalfinance/",[361],"/r/personalfinance/",[19,404,407],{"href":405,"rel":406},"https://reddit.com/r/gadgets/",[361],"/r/gadgets/",[19,409,412],{"href":410,"rel":411},"https://reddit.com/r/science/",[361],"/r/science/",[19,414,417],{"href":415,"rel":416},"https://reddit.com/r/dataisbeautiful/",[361],"/r/dataisbeautiful/",[19,419,422],{"href":420,"rel":421},"https://reddit.com/r/DIY/",[361],"/r/DIY/",[19,424,427],{"href":425,"rel":426},"https://reddit.com/r/AskReddit/",[361],"/r/AskReddit/",[19,429,432],{"href":430,"rel":431},"https://reddit.com/r/space/",[361],"/r/space/",[19,434,437],{"href":435,"rel":436},"https://reddit.com/r/nosleep/",[361],"/r/nosleep/",[19,439,442],{"hre
f":440,"rel":441},"https://reddit.com/r/Documentaries/",[361],"/r/Documentaries/",[19,444,447],{"href":445,"rel":446},"https://reddit.com/r/todayilearned/",[361],"/r/todayilearned/",[19,449,452],{"href":450,"rel":451},"https://reddit.com/r/television/",[361],"/r/television/",[19,454,457],{"href":455,"rel":456},"https://reddit.com/r/IAmA/",[361],"/r/IAmA/",[19,459,462],{"href":460,"rel":461},"https://reddit.com/r/Art/",[361],"/r/Art/",[19,464,467],{"href":465,"rel":466},"https://reddit.com/r/EarthPorn/",[361],"/r/EarthPorn/",[19,469,472],{"href":470,"rel":471},"https://reddit.com/r/books/",[361],"/r/books/",[19,474,477],{"href":475,"rel":476},"https://reddit.com/r/gifs/",[361],"/r/gifs/",[19,479,482],{"href":480,"rel":481},"https://reddit.com/r/Showerthoughts/",[361],"/r/Showerthoughts/",[19,484,487],{"href":485,"rel":486},"https://reddit.com/r/blog/",[361],"/r/blog/",[19,489,492],{"href":490,"rel":491},"https://reddit.com/r/news/",[361],"/r/news/",[19,494,497],{"href":495,"rel":496},"https://reddit.com/r/Jokes/",[361],"/r/Jokes/",[19,499,502],{"href":500,"rel":501},"https://reddit.com/r/TwoXChromosomes/",[361],"/r/TwoXChromosomes/",[19,504,507],{"href":505,"rel":506},"https://reddit.com/r/videos/",[361],"/r/videos/",[19,509,512],{"href":510,"rel":511},"https://reddit.com/r/philosophy/",[361],"/r/philosophy/",[19,514,517],{"href":515,"rel":516},"https://reddit.com/r/nottheonion/",[361],"/r/nottheonion/",[19,519,522],{"href":520,"rel":521},"https://reddit.com/r/explainlikeimfive/",[361],"/r/explainlikeimfive/",[19,524,527],{"href":525,"rel":526},"https://reddit.com/r/movies/",[361],"/r/movies/",[19,529,532],{"href":530,"rel":531},"https://reddit.com/r/Music/",[361],"/r/Music/",[19,534,537],{"href":535,"rel":536},"https://reddit.com/r/WritingPrompts/",[361],"/r/WritingPrompts/",[19,539,542],{"href":540,"rel":541},"https://reddit.com/r/worldnews/",[361],"/r/worldnews/",[19,544,547],{"href":545,"rel":546},"https://reddit.com/r/pics/",[361],"/r/pics/",[19,549,552],{"href"
:550,"rel":551},"https://reddit.com/r/history/",[361],"/r/history/",[19,554,557],{"href":555,"rel":556},"https://reddit.com/r/listentothis/",[361],"/r/listentothis/",[19,559,562],{"href":560,"rel":561},"https://reddit.com/r/sports/",[361],"/r/sports/",[19,564,567],{"href":565,"rel":566},"https://reddit.com/r/food/",[361],"/r/food/",[19,569,572],{"href":570,"rel":571},"https://reddit.com/r/creepy/",[361],"/r/creepy/",[19,574,577],{"href":575,"rel":576},"https://reddit.com/r/announcements/",[361],"/r/announcements/",[19,579,582],{"href":580,"rel":581},"https://reddit.com/r/gaming/",[361],"/r/gaming/",[19,584,587],{"href":585,"rel":586},"https://reddit.com/r/tifu/",[361],"/r/tifu/",[19,589,592],{"href":590,"rel":591},"https://reddit.com/r/funny/",[361],"/r/funny/",[19,594,597],{"href":595,"rel":596},"https://reddit.com/r/photoshopbattles/",[361],"/r/photoshopbattles/",[19,599,602],{"href":600,"rel":601},"https://reddit.com/r/InternetIsBeautiful/",[361],"/r/InternetIsBeautiful/",",",[15,605,606],{},"My goal here is to see how many subreddits we can reach as we branch off of these \"default\" subreddits into their related subreddits.",[15,608,609],{},"First, we need to set up data structures to hold data for subreddits and their related subreddits. And we need to define an algorithm for collecting data.",[15,611,612,613,215],{},"Here's an introduction to graphs from ",[19,614,617],{"href":615,"rel":616},"https://www.python.org/doc/essays/graphs/",[361],"python.org",[353,619,620],{},[15,621,622],{},"Few programming languages provide direct support for graphs as a data type, and Python is no exception. However, graphs are easily built out of lists and dictionaries. 
For instance, here's a simple graph (I can't use drawings in these columns, so I write down the graph's arcs):",[39,624,629],{"className":625,"code":627,"language":628,"meta":44},[626],"language-text","A -> B\nA -> C\nB -> C\nB -> D\nC -> D\nD -> C\nE -> F\nF -> C\n","text",[30,630,627],{"__ignoreMap":44},[15,632,633],{},"This graph has six nodes (A-F) and eight arcs. It can be represented by the following Python data structure:",[39,635,638],{"className":636,"code":637,"language":628,"meta":44},[626],"graph =     {'A': ['B', 'C'],\n             'B': ['C', 'D'],\n             'C': ['D'],\n             'D': ['C'],\n             'E': ['F'],\n             'F': ['C']}\n",[30,639,637],{"__ignoreMap":44},[15,641,642,643,647],{},"First let's define how we would go only one branch deep into this graph (i.e. find the related subreddits for ",[644,645,646],"em",{},"only"," the default subreddits). To collect the data, I first looped through the default subreddits and saved the html of each subreddit to its own text file. 
Here's a script with comments:",[39,649,651],{"className":41,"code":650,"language":43,"meta":44,"style":44},"#first we navigate to the correct folder where we will store the first level of related subreddits\nos.chdir(os.path.expanduser('~/Documents/Projects/Data/Subreddits/one/'))\n\n#next we instantiate the webdriver we will be using: PhantomJS\ndriver = webdriver.PhantomJS()\n\n#loop through the list of default subreddits\nfor num, subreddit in enumerate(default_subreddits):\n\n    #for each subreddit, we append the /r/subreddit path to the base URL (reddit.com)\n    driver.get('https://www.reddit.com'+subreddit)\n\n    #wait for two seconds\n    time.sleep(2 + np.random.random())\n\n    #save the html of the loaded page to a variable: html\n    html = driver.page_source.encode('utf-8')\n\n    #remove '/r/' from the subreddit name string\n    name = subreddit.split('/')[2]\n\n    #open a new file and give it the name of the subreddit we just scraped\n    subreddit_html_file = open(name+'.txt', 'w+')\n\n    #write the html contents to the file\n    subreddit_html_file.write(html)\n\n    #clost the file\n    subreddit_html_file.close()\n\n    #print out the number and name of the subreddit we just scrapped to make sure things are working\n    print str(num) + ' ' + subreddit,\n\n",[30,652,653,659,670,674,679,687,691,696,711,715,720,733,738,744,757,762,768,782,787,793,815,820,826,852,857,863,869,874,880,886,891,897],{"__ignoreMap":44},[48,654,655],{"class":50,"line":51},[48,656,658],{"class":657},"s8-w5","#first we navigate to the correct folder where we will store the first level of related subreddits\n",[48,660,661,664,667],{"class":50,"line":68},[48,662,663],{"class":58},"os.chdir(os.path.expanduser(",[48,665,666],{"class":132},"'~/Documents/Projects/Data/Subreddits/one/'",[48,668,669],{"class":58},"))\n",[48,671,672],{"class":50,"line":76},[48,673,172],{"emptyLinePlaceholder":171},[48,675,676],{"class":50,"line":84},[48,677,678],{"class":657},"#next we 
instantiate the webdriver we will be using: PhantomJS\n",[48,680,681,683,685],{"class":50,"line":98},[48,682,118],{"class":58},[48,684,121],{"class":54},[48,686,124],{"class":58},[48,688,689],{"class":50,"line":175},[48,690,172],{"emptyLinePlaceholder":171},[48,692,693],{"class":50,"line":186},[48,694,695],{"class":657},"#loop through the list of default subreddits\n",[48,697,698,700,703,705,708],{"class":50,"line":224},[48,699,308],{"class":54},[48,701,702],{"class":58}," num, subreddit ",[48,704,314],{"class":54},[48,706,707],{"class":320}," enumerate",[48,709,710],{"class":58},"(default_subreddits):\n",[48,712,713],{"class":50,"line":273},[48,714,172],{"emptyLinePlaceholder":171},[48,716,717],{"class":50,"line":300},[48,718,719],{"class":657},"    #for each subreddit, we append the /r/subreddit path to the base URL (reddit.com)\n",[48,721,722,725,728,730],{"class":50,"line":305},[48,723,724],{"class":58},"    driver.get(",[48,726,727],{"class":132},"'https://www.reddit.com'",[48,729,261],{"class":54},[48,731,732],{"class":58},"subreddit)\n",[48,734,736],{"class":50,"line":735},12,[48,737,172],{"emptyLinePlaceholder":171},[48,739,741],{"class":50,"line":740},13,[48,742,743],{"class":657},"    #wait for two seconds\n",[48,745,747,750,753,755],{"class":50,"line":746},14,[48,748,749],{"class":58},"    time.sleep(",[48,751,752],{"class":144},"2",[48,754,148],{"class":54},[48,756,151],{"class":58},[48,758,760],{"class":50,"line":759},15,[48,761,172],{"emptyLinePlaceholder":171},[48,763,765],{"class":50,"line":764},16,[48,766,767],{"class":657},"    #save the html of the loaded page to a variable: html\n",[48,769,771,774,776,778,780],{"class":50,"line":770},17,[48,772,773],{"class":58},"    html ",[48,775,121],{"class":54},[48,777,161],{"class":58},[48,779,164],{"class":132},[48,781,136],{"class":58},[48,783,785],{"class":50,"line":784},18,[48,786,172],{"emptyLinePlaceholder":171},[48,788,790],{"class":50,"line":789},19,[48,791,792],{"class":657},"    #remove '/r/' 
from the subreddit name string\n",[48,794,796,799,801,804,807,810,812],{"class":50,"line":795},20,[48,797,798],{"class":58},"    name ",[48,800,121],{"class":54},[48,802,803],{"class":58}," subreddit.split(",[48,805,806],{"class":132},"'/'",[48,808,809],{"class":58},")[",[48,811,752],{"class":144},[48,813,814],{"class":58},"]\n",[48,816,818],{"class":50,"line":817},21,[48,819,172],{"emptyLinePlaceholder":171},[48,821,823],{"class":50,"line":822},22,[48,824,825],{"class":657},"    #open a new file and give it the name of the subreddit we just scraped\n",[48,827,829,832,834,837,840,842,845,847,850],{"class":50,"line":828},23,[48,830,831],{"class":58},"    subreddit_html_file ",[48,833,121],{"class":54},[48,835,836],{"class":320}," open",[48,838,839],{"class":58},"(name",[48,841,261],{"class":54},[48,843,844],{"class":132},"'.txt'",[48,846,200],{"class":58},[48,848,849],{"class":132},"'w+'",[48,851,136],{"class":58},[48,853,855],{"class":50,"line":854},24,[48,856,172],{"emptyLinePlaceholder":171},[48,858,860],{"class":50,"line":859},25,[48,861,862],{"class":657},"    #write the html contents to the file\n",[48,864,866],{"class":50,"line":865},26,[48,867,868],{"class":58},"    subreddit_html_file.write(html)\n",[48,870,872],{"class":50,"line":871},27,[48,873,172],{"emptyLinePlaceholder":171},[48,875,877],{"class":50,"line":876},28,[48,878,879],{"class":657},"    #clost the file\n",[48,881,883],{"class":50,"line":882},29,[48,884,885],{"class":58},"    subreddit_html_file.close()\n",[48,887,889],{"class":50,"line":888},30,[48,890,172],{"emptyLinePlaceholder":171},[48,892,894],{"class":50,"line":893},31,[48,895,896],{"class":657},"    #print out the number and name of the subreddit we just scrapped to make sure things are working\n",[48,898,900,903,906,909,911,914,916],{"class":50,"line":899},32,[48,901,902],{"class":320},"    print",[48,904,905],{"class":281}," str",[48,907,908],{"class":58},"(num) ",[48,910,261],{"class":54},[48,912,913],{"class":132}," ' 
'",[48,915,148],{"class":54},[48,917,918],{"class":58}," subreddit,\n",[15,920,921],{},"Next, we want to go through each file and extract the information we want. Here's what we will be getting:",[923,924,925,929,932,935],"ul",{},[926,927,928],"li",{},"Number of subscribers",[926,930,931],{},"Subreddit description",[926,933,934],{},"Date created",[926,936,937],{},"Related subreddits",[15,939,940],{},"For this type of project, I prefer to loop through each page and create several small dictionaries for each data point, then combine the small dictionaries into a large dictionary, and then append the dictionary to a list of dictionaries. Once I have looped through all of the pages, I can create a pandas DataFrame from the list of dictionaries. This allows me to easily manipulate the data. Here's the script that I used to do this:",[39,942,944],{"className":41,"code":943,"language":43,"meta":44,"style":44},"#navigate to where the html files are stored (I moved them around a bit so it is not consistent with the script above)\nos.chdir('E://DATA/Subreddits/subreddits_html/')\n\n#generate a list of files that we will loop through\nfiles = os.listdir('E://DATA/Subreddits/subreddits_html/')\n\n#set up an empty list that we will append dictionaries to\ndict_list = []\n\n#loop through the files\nfor file_ in files:\n\n    #print out the name of the current file in the loop\n    print file_,\n\n    #open the file\n    f = open(file_, 'r')\n    #read the file contents to a local variable\n    html = f.read()\n    #create a BeautifulSoup object that we will use to parse the HTML\n    b = BeautifulSoup(html, 'lxml')\n\n    #get the subreddit name that we are working with (from the `file` variable)\n    subreddit_name = '/r/' + file_[:-4].lower()\n    #put the name into a dictionary\n    subreddit_name_dict = {'subreddit':subreddit_name}\n\n    #get number of subscribers\n    subs = b.find('span', attrs={'class':'subscribers'})\n    #if the number of subscribers is displayed on 
the page, then we find it and add it to a dictionary\n    if subs:\n        subs = b.find('span', attrs={'class':'subscribers'}).find('span', attrs={'class':'number'}).text.replace(',', '')\n        subs_dict = {'subscribers':int(subs)}\n    #if the number of subscribers is not displayed on the page, then we set the number of subscribers in the dictionary to None\n    else:\n        subs_dict = {'subscribers':None}\n\n    #similar process for the description: if the description is displayed, get it and save it to desc\n    #if it is not available, then desc will be set to `None`\n    desc = b.find('div', attrs={'class':'md'})\n    if desc:\n        desc = b.find('div', attrs={'class':'md'}).text\n        desc = desc.replace('\\n', ' ')\n    desc_dict = {'description':desc}\n\n    #here we use regular expressions to find links anywhere on the page that have the structure: \"/r/something/\"\n    rel_subr = re.compile(r\"\\/r\\/[\\w.]+\\/?\")\n    #make a list of these links based on the \"/r/something/\" pattern\n    related_subreddits = rel_subr.findall(html)\n\n    #save the list to a dictionary\n    subreddits_dict = {'related':related_subreddits}\n\n    #same processes for recording the date that the subreddit was created: get the date from an HTML element,\n    #then save it to a dictionary. 
There were two different formats available in the HTML so I grabbed both\n    age = b.find('span', attrs={'class':'age'})\n    if age:\n        time1 = age.find('time')['title']\n        time2 = age.find('time')['datetime']\n\n    #save the date to a dictionary\n    time_dict = {\"date1\":time1, \"date2\":time2}\n\n    #take all the dictionaries we just created and put them together into one big dictionary\n    dictionary = dict(subs_dict.items()+desc_dict.items()+subreddits_dict.items()+subreddit_name_dict.items()+time_dict.items())\n\n    #append the big dictionary to the list that we defined right before the beginning of the loop\n    dict_list.append(dictionary)\n\n    #deconstruct the Beautiful Soup object (this can eat up memory very quickly, so it is very important when processing lots of data)\n    b.decompose()\n\n    #clost the file\n    f.close()\n",[30,945,946,951,961,965,970,984,988,993,1003,1007,1012,1024,1028,1033,1040,1044,1049,1066,1071,1080,1085,1100,1104,1109,1132,1137,1153,1157,1162,1192,1197,1205,1263,1283,1289,1298,1317,1322,1328,1334,1363,1371,1400,1425,1441,1446,1452,1488,1494,1505,1510,1516,1532,1537,1543,1549,1578,1586,1607,1626,1631,1637,1659,1664,1670,1704,1709,1715,1721,1726,1732,1738,1743,1748],{"__ignoreMap":44},[48,947,948],{"class":50,"line":51},[48,949,950],{"class":657},"#navigate to where the html files are stored (I moved them around a bit so it is not consistent with the script above)\n",[48,952,953,956,959],{"class":50,"line":68},[48,954,955],{"class":58},"os.chdir(",[48,957,958],{"class":132},"'E://DATA/Subreddits/subreddits_html/'",[48,960,136],{"class":58},[48,962,963],{"class":50,"line":76},[48,964,172],{"emptyLinePlaceholder":171},[48,966,967],{"class":50,"line":84},[48,968,969],{"class":657},"#generate a list of files that we will loop through\n",[48,971,972,975,977,980,982],{"class":50,"line":98},[48,973,974],{"class":58},"files ",[48,976,121],{"class":54},[48,978,979],{"class":58}," 
os.listdir(",[48,981,958],{"class":132},[48,983,136],{"class":58},[48,985,986],{"class":50,"line":175},[48,987,172],{"emptyLinePlaceholder":171},[48,989,990],{"class":50,"line":186},[48,991,992],{"class":657},"#set up an empty list that we will append dictionaries to\n",[48,994,995,998,1000],{"class":50,"line":224},[48,996,997],{"class":58},"dict_list ",[48,999,121],{"class":54},[48,1001,1002],{"class":58}," []\n",[48,1004,1005],{"class":50,"line":273},[48,1006,172],{"emptyLinePlaceholder":171},[48,1008,1009],{"class":50,"line":300},[48,1010,1011],{"class":657},"#loop through the files\n",[48,1013,1014,1016,1019,1021],{"class":50,"line":305},[48,1015,308],{"class":54},[48,1017,1018],{"class":58}," file_ ",[48,1020,314],{"class":54},[48,1022,1023],{"class":58}," files:\n",[48,1025,1026],{"class":50,"line":735},[48,1027,172],{"emptyLinePlaceholder":171},[48,1029,1030],{"class":50,"line":740},[48,1031,1032],{"class":657},"    #print out the name of the current file in the loop\n",[48,1034,1035,1037],{"class":50,"line":746},[48,1036,902],{"class":320},[48,1038,1039],{"class":58}," file_,\n",[48,1041,1042],{"class":50,"line":759},[48,1043,172],{"emptyLinePlaceholder":171},[48,1045,1046],{"class":50,"line":764},[48,1047,1048],{"class":657},"    #open the file\n",[48,1050,1051,1054,1056,1058,1061,1064],{"class":50,"line":770},[48,1052,1053],{"class":58},"    f ",[48,1055,121],{"class":54},[48,1057,836],{"class":320},[48,1059,1060],{"class":58},"(file_, ",[48,1062,1063],{"class":132},"'r'",[48,1065,136],{"class":58},[48,1067,1068],{"class":50,"line":784},[48,1069,1070],{"class":657},"    #read the file contents to a local variable\n",[48,1072,1073,1075,1077],{"class":50,"line":789},[48,1074,773],{"class":58},[48,1076,121],{"class":54},[48,1078,1079],{"class":58}," f.read()\n",[48,1081,1082],{"class":50,"line":795},[48,1083,1084],{"class":657},"    #create a BeautifulSoup object that we will use to parse the 
HTML\n",[48,1086,1087,1090,1092,1095,1098],{"class":50,"line":817},[48,1088,1089],{"class":58},"    b ",[48,1091,121],{"class":54},[48,1093,1094],{"class":58}," BeautifulSoup(html, ",[48,1096,1097],{"class":132},"'lxml'",[48,1099,136],{"class":58},[48,1101,1102],{"class":50,"line":822},[48,1103,172],{"emptyLinePlaceholder":171},[48,1105,1106],{"class":50,"line":828},[48,1107,1108],{"class":657},"    #get the subreddit name that we are working with (from the `file` variable)\n",[48,1110,1111,1114,1116,1119,1121,1124,1127,1129],{"class":50,"line":854},[48,1112,1113],{"class":58},"    subreddit_name ",[48,1115,121],{"class":54},[48,1117,1118],{"class":132}," '/r/'",[48,1120,148],{"class":54},[48,1122,1123],{"class":58}," file_[:",[48,1125,1126],{"class":54},"-",[48,1128,145],{"class":144},[48,1130,1131],{"class":58},"].lower()\n",[48,1133,1134],{"class":50,"line":859},[48,1135,1136],{"class":657},"    #put the name into a dictionary\n",[48,1138,1139,1142,1144,1147,1150],{"class":50,"line":865},[48,1140,1141],{"class":58},"    subreddit_name_dict ",[48,1143,121],{"class":54},[48,1145,1146],{"class":58}," {",[48,1148,1149],{"class":132},"'subreddit'",[48,1151,1152],{"class":58},":subreddit_name}\n",[48,1154,1155],{"class":50,"line":871},[48,1156,172],{"emptyLinePlaceholder":171},[48,1158,1159],{"class":50,"line":876},[48,1160,1161],{"class":657},"    #get number of subscribers\n",[48,1163,1164,1167,1169,1172,1175,1177,1179,1181,1183,1185,1187,1190],{"class":50,"line":882},[48,1165,1166],{"class":58},"    subs ",[48,1168,121],{"class":54},[48,1170,1171],{"class":58}," b.find(",[48,1173,1174],{"class":132},"'span'",[48,1176,200],{"class":58},[48,1178,204],{"class":203},[48,1180,121],{"class":54},[48,1182,209],{"class":58},[48,1184,212],{"class":132},[48,1186,215],{"class":58},[48,1188,1189],{"class":132},"'subscribers'",[48,1191,221],{"class":58},[48,1193,1194],{"class":50,"line":888},[48,1195,1196],{"class":657},"    #if the number of subscribers is displayed on the 
page, then we find it and add it to a dictionary\n",[48,1198,1199,1202],{"class":50,"line":893},[48,1200,1201],{"class":54},"    if",[48,1203,1204],{"class":58}," subs:\n",[48,1206,1207,1210,1212,1214,1216,1218,1220,1222,1224,1226,1228,1230,1233,1235,1237,1239,1241,1243,1245,1247,1250,1253,1256,1258,1261],{"class":50,"line":899},[48,1208,1209],{"class":58},"        subs ",[48,1211,121],{"class":54},[48,1213,1171],{"class":58},[48,1215,1174],{"class":132},[48,1217,200],{"class":58},[48,1219,204],{"class":203},[48,1221,121],{"class":54},[48,1223,209],{"class":58},[48,1225,212],{"class":132},[48,1227,215],{"class":58},[48,1229,1189],{"class":132},[48,1231,1232],{"class":58},"}).find(",[48,1234,1174],{"class":132},[48,1236,200],{"class":58},[48,1238,204],{"class":203},[48,1240,121],{"class":54},[48,1242,209],{"class":58},[48,1244,212],{"class":132},[48,1246,215],{"class":58},[48,1248,1249],{"class":132},"'number'",[48,1251,1252],{"class":58},"}).text.replace(",[48,1254,1255],{"class":132},"','",[48,1257,200],{"class":58},[48,1259,1260],{"class":132},"''",[48,1262,136],{"class":58},[48,1264,1266,1269,1271,1273,1275,1277,1280],{"class":50,"line":1265},33,[48,1267,1268],{"class":58},"        subs_dict ",[48,1270,121],{"class":54},[48,1272,1146],{"class":58},[48,1274,1189],{"class":132},[48,1276,215],{"class":58},[48,1278,1279],{"class":281},"int",[48,1281,1282],{"class":58},"(subs)}\n",[48,1284,1286],{"class":50,"line":1285},34,[48,1287,1288],{"class":657},"    #if the number of subscribers is not displayed on the page, then we set the number of subscribers in the dictionary to None\n",[48,1290,1292,1295],{"class":50,"line":1291},35,[48,1293,1294],{"class":54},"    
else",[48,1296,1297],{"class":58},":\n",[48,1299,1301,1303,1305,1307,1309,1311,1314],{"class":50,"line":1300},36,[48,1302,1268],{"class":58},[48,1304,121],{"class":54},[48,1306,1146],{"class":58},[48,1308,1189],{"class":132},[48,1310,215],{"class":58},[48,1312,1313],{"class":144},"None",[48,1315,1316],{"class":58},"}\n",[48,1318,1320],{"class":50,"line":1319},37,[48,1321,172],{"emptyLinePlaceholder":171},[48,1323,1325],{"class":50,"line":1324},38,[48,1326,1327],{"class":657},"    #similar process for the description: if the description is displayed, get it and save it to desc\n",[48,1329,1331],{"class":50,"line":1330},39,[48,1332,1333],{"class":657},"    #if it is not available, then desc will be set to `None`\n",[48,1335,1337,1340,1342,1344,1346,1348,1350,1352,1354,1356,1358,1361],{"class":50,"line":1336},40,[48,1338,1339],{"class":58},"    desc ",[48,1341,121],{"class":54},[48,1343,1171],{"class":58},[48,1345,197],{"class":132},[48,1347,200],{"class":58},[48,1349,204],{"class":203},[48,1351,121],{"class":54},[48,1353,209],{"class":58},[48,1355,212],{"class":132},[48,1357,215],{"class":58},[48,1359,1360],{"class":132},"'md'",[48,1362,221],{"class":58},[48,1364,1366,1368],{"class":50,"line":1365},41,[48,1367,1201],{"class":54},[48,1369,1370],{"class":58}," desc:\n",[48,1372,1374,1377,1379,1381,1383,1385,1387,1389,1391,1393,1395,1397],{"class":50,"line":1373},42,[48,1375,1376],{"class":58},"        desc ",[48,1378,121],{"class":54},[48,1380,1171],{"class":58},[48,1382,197],{"class":132},[48,1384,200],{"class":58},[48,1386,204],{"class":203},[48,1388,121],{"class":54},[48,1390,209],{"class":58},[48,1392,212],{"class":132},[48,1394,215],{"class":58},[48,1396,1360],{"class":132},[48,1398,1399],{"class":58},"}).text\n",[48,1401,1403,1405,1407,1410,1413,1416,1418,1420,1423],{"class":50,"line":1402},43,[48,1404,1376],{"class":58},[48,1406,121],{"class":54},[48,1408,1409],{"class":58}," 
desc.replace(",[48,1411,1412],{"class":132},"'",[48,1414,1415],{"class":144},"\\n",[48,1417,1412],{"class":132},[48,1419,200],{"class":58},[48,1421,1422],{"class":132},"' '",[48,1424,136],{"class":58},[48,1426,1428,1431,1433,1435,1438],{"class":50,"line":1427},44,[48,1429,1430],{"class":58},"    desc_dict ",[48,1432,121],{"class":54},[48,1434,1146],{"class":58},[48,1436,1437],{"class":132},"'description'",[48,1439,1440],{"class":58},":desc}\n",[48,1442,1444],{"class":50,"line":1443},45,[48,1445,172],{"emptyLinePlaceholder":171},[48,1447,1449],{"class":50,"line":1448},46,[48,1450,1451],{"class":657},"    #here we use regular expressions to find links anywhere on the page that have the structure: \"/r/something/\"\n",[48,1453,1455,1458,1460,1462,1464,1466,1468,1470,1472,1474,1476,1478,1480,1482,1484,1486],{"class":50,"line":1454},47,[48,1456,1457],{"class":58},"    rel_subr ",[48,1459,121],{"class":54},[48,1461,232],{"class":58},[48,1463,236],{"class":235},[48,1465,239],{"class":132},[48,1467,243],{"class":242},[48,1469,236],{"class":246},[48,1471,243],{"class":242},[48,1473,251],{"class":144},[48,1475,255],{"class":254},[48,1477,258],{"class":144},[48,1479,261],{"class":54},[48,1481,243],{"class":242},[48,1483,266],{"class":54},[48,1485,239],{"class":132},[48,1487,136],{"class":58},[48,1489,1491],{"class":50,"line":1490},48,[48,1492,1493],{"class":657},"    #make a list of these links based on the \"/r/something/\" pattern\n",[48,1495,1497,1500,1502],{"class":50,"line":1496},49,[48,1498,1499],{"class":58},"    related_subreddits ",[48,1501,121],{"class":54},[48,1503,1504],{"class":58}," rel_subr.findall(html)\n",[48,1506,1508],{"class":50,"line":1507},50,[48,1509,172],{"emptyLinePlaceholder":171},[48,1511,1513],{"class":50,"line":1512},51,[48,1514,1515],{"class":657},"    #save the list to a dictionary\n",[48,1517,1519,1522,1524,1526,1529],{"class":50,"line":1518},52,[48,1520,1521],{"class":58},"    subreddits_dict 
",[48,1523,121],{"class":54},[48,1525,1146],{"class":58},[48,1527,1528],{"class":132},"'related'",[48,1530,1531],{"class":58},":related_subreddits}\n",[48,1533,1535],{"class":50,"line":1534},53,[48,1536,172],{"emptyLinePlaceholder":171},[48,1538,1540],{"class":50,"line":1539},54,[48,1541,1542],{"class":657},"    #same processes for recording the date that the subreddit was created: get the date from an HTML element,\n",[48,1544,1546],{"class":50,"line":1545},55,[48,1547,1548],{"class":657},"    #then save it to a dictionary. There were two different formats available in the HTML so I grabbed both\n",[48,1550,1552,1555,1557,1559,1561,1563,1565,1567,1569,1571,1573,1576],{"class":50,"line":1551},56,[48,1553,1554],{"class":58},"    age ",[48,1556,121],{"class":54},[48,1558,1171],{"class":58},[48,1560,1174],{"class":132},[48,1562,200],{"class":58},[48,1564,204],{"class":203},[48,1566,121],{"class":54},[48,1568,209],{"class":58},[48,1570,212],{"class":132},[48,1572,215],{"class":58},[48,1574,1575],{"class":132},"'age'",[48,1577,221],{"class":58},[48,1579,1581,1583],{"class":50,"line":1580},57,[48,1582,1201],{"class":54},[48,1584,1585],{"class":58}," age:\n",[48,1587,1589,1592,1594,1597,1600,1602,1605],{"class":50,"line":1588},58,[48,1590,1591],{"class":58},"        time1 ",[48,1593,121],{"class":54},[48,1595,1596],{"class":58}," age.find(",[48,1598,1599],{"class":132},"'time'",[48,1601,809],{"class":58},[48,1603,1604],{"class":132},"'title'",[48,1606,814],{"class":58},[48,1608,1610,1613,1615,1617,1619,1621,1624],{"class":50,"line":1609},59,[48,1611,1612],{"class":58},"        time2 ",[48,1614,121],{"class":54},[48,1616,1596],{"class":58},[48,1618,1599],{"class":132},[48,1620,809],{"class":58},[48,1622,1623],{"class":132},"'datetime'",[48,1625,814],{"class":58},[48,1627,1629],{"class":50,"line":1628},60,[48,1630,172],{"emptyLinePlaceholder":171},[48,1632,1634],{"class":50,"line":1633},61,[48,1635,1636],{"class":657},"    #save the date to a 
dictionary\n",[48,1638,1640,1643,1645,1647,1650,1653,1656],{"class":50,"line":1639},62,[48,1641,1642],{"class":58},"    time_dict ",[48,1644,121],{"class":54},[48,1646,1146],{"class":58},[48,1648,1649],{"class":132},"\"date1\"",[48,1651,1652],{"class":58},":time1, ",[48,1654,1655],{"class":132},"\"date2\"",[48,1657,1658],{"class":58},":time2}\n",[48,1660,1662],{"class":50,"line":1661},63,[48,1663,172],{"emptyLinePlaceholder":171},[48,1665,1667],{"class":50,"line":1666},64,[48,1668,1669],{"class":657},"    #take all the dictionaries we just created and put them together into one big dictionary\n",[48,1671,1673,1676,1678,1681,1684,1686,1689,1691,1694,1696,1699,1701],{"class":50,"line":1672},65,[48,1674,1675],{"class":58},"    dictionary ",[48,1677,121],{"class":54},[48,1679,1680],{"class":281}," dict",[48,1682,1683],{"class":58},"(subs_dict.items()",[48,1685,261],{"class":54},[48,1687,1688],{"class":58},"desc_dict.items()",[48,1690,261],{"class":54},[48,1692,1693],{"class":58},"subreddits_dict.items()",[48,1695,261],{"class":54},[48,1697,1698],{"class":58},"subreddit_name_dict.items()",[48,1700,261],{"class":54},[48,1702,1703],{"class":58},"time_dict.items())\n",[48,1705,1707],{"class":50,"line":1706},66,[48,1708,172],{"emptyLinePlaceholder":171},[48,1710,1712],{"class":50,"line":1711},67,[48,1713,1714],{"class":657},"    #append the big dictionary to the list that we defined right before the beginning of the loop\n",[48,1716,1718],{"class":50,"line":1717},68,[48,1719,1720],{"class":58},"    dict_list.append(dictionary)\n",[48,1722,1724],{"class":50,"line":1723},69,[48,1725,172],{"emptyLinePlaceholder":171},[48,1727,1729],{"class":50,"line":1728},70,[48,1730,1731],{"class":657},"    #deconstruct the Beautiful Soup object (this can eat up memory very quickly, so it is very important when processing lots of data)\n",[48,1733,1735],{"class":50,"line":1734},71,[48,1736,1737],{"class":58},"    
b.decompose()\n",[48,1739,1741],{"class":50,"line":1740},[48,1742,172],{"emptyLinePlaceholder":171},[48,1744,1746],{"class":50,"line":1745},[48,1747,879],{"class":657},[48,1749,1751],{"class":50,"line":1750},[48,1752,1753],{"class":58},"    f.close()\n",[15,1755,1756],{},"Next, let's save the results into a CSV file. This lets us load the results quickly without having to scrape everything again. To do this we can use the pandas library.",[39,1758,1760],{"className":41,"code":1759,"language":43,"meta":44,"style":44},"import pandas as pd\ndf0 = pd.DataFrame(dict_list, index=None)\n",[30,1761,1762,1774],{"__ignoreMap":44},[48,1763,1764,1766,1769,1771],{"class":50,"line":51},[48,1765,62],{"class":54},[48,1767,1768],{"class":58}," pandas ",[48,1770,92],{"class":54},[48,1772,1773],{"class":58}," pd\n",[48,1775,1776,1779,1781,1784,1787,1789,1791],{"class":50,"line":68},[48,1777,1778],{"class":58},"df0 ",[48,1780,121],{"class":54},[48,1782,1783],{"class":58}," pd.DataFrame(dict_list, ",[48,1785,1786],{"class":203},"index",[48,1788,121],{"class":54},[48,1790,1313],{"class":144},[48,1792,136],{"class":58},[15,1794,1795,1796,1799],{},"At this point, we can go through the ",[30,1797,1798],{},"related"," column in the DataFrame and put together a list of all the related subreddits. With this list, we can simply repeat the process over and over again. 
However, each time we start with a new list of subreddits, we want to make sure that they have not already been collected.",[15,1801,1802],{},"Next I will read in one DataFrame that represents related subreddits \"three levels deep\" relative to the default subreddits.",[15,1804,1805],{},[1806,1807,1808],"strong",{},"Default --> Related --> Related --> Related",[15,1810,1811],{},"This DataFrame represents the collection of subreddits from all of these \"layers\" of the graph.",[39,1813,1815],{"className":41,"code":1814,"language":43,"meta":44,"style":44},"import pandas as pd\nmaster_df = pd.read_pickle('pickle/master_df.p')\n",[30,1816,1817,1827],{"__ignoreMap":44},[48,1818,1819,1821,1823,1825],{"class":50,"line":51},[48,1820,62],{"class":54},[48,1822,1768],{"class":58},[48,1824,92],{"class":54},[48,1826,1773],{"class":58},[48,1828,1829,1832,1834,1837,1840],{"class":50,"line":68},[48,1830,1831],{"class":58},"master_df ",[48,1833,121],{"class":54},[48,1835,1836],{"class":58}," pd.read_pickle(",[48,1838,1839],{"class":132},"'pickle/master_df.p'",[48,1841,136],{"class":58},[15,1843,1844],{},"Now we can do a quick visualization of the growth in number of subreddits since the website's start in 2005.",[39,1846,1848],{"className":41,"code":1847,"language":43,"meta":44,"style":44},"import warnings\nwarnings.filterwarnings('ignore')\n%matplotlib inline\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\nmaster_df_ = master_df[master_df.notnull()]\nmaster_df_.date1 = pd.to_datetime(master_df_['date1'])\n\nlist_of_dates = master_df_.date1.sort_values()\n\ncounts = np.arange(0, len(list_of_dates))\n_ = plt.plot(list_of_dates, counts)\n_ = plt.title('Number of subreddits over time')\n_ = plt.xlabel('Date')\n_ = plt.ylabel('Cummulative 
Count')\nplt.savefig(os.path.expanduser('~/Documents/GitHub/briancaffey.github.io/static/subreddit_graph/subreddits_count.png'))\n",[30,1849,1850,1857,1867,1875,1887,1899,1909,1913,1923,1939,1943,1953,1957,1978,1988,2002,2016,2030],{"__ignoreMap":44},[48,1851,1852,1854],{"class":50,"line":51},[48,1853,62],{"class":54},[48,1855,1856],{"class":58}," warnings\n",[48,1858,1859,1862,1865],{"class":50,"line":68},[48,1860,1861],{"class":58},"warnings.filterwarnings(",[48,1863,1864],{"class":132},"'ignore'",[48,1866,136],{"class":58},[48,1868,1869,1872],{"class":50,"line":76},[48,1870,1871],{"class":54},"%",[48,1873,1874],{"class":58},"matplotlib inline\n",[48,1876,1877,1879,1882,1884],{"class":50,"line":84},[48,1878,62],{"class":54},[48,1880,1881],{"class":58}," matplotlib.pyplot ",[48,1883,92],{"class":54},[48,1885,1886],{"class":58}," plt\n",[48,1888,1889,1891,1894,1896],{"class":50,"line":98},[48,1890,62],{"class":54},[48,1892,1893],{"class":58}," seaborn ",[48,1895,92],{"class":54},[48,1897,1898],{"class":58}," sns\n",[48,1900,1901,1903,1905,1907],{"class":50,"line":175},[48,1902,62],{"class":54},[48,1904,89],{"class":58},[48,1906,92],{"class":54},[48,1908,95],{"class":58},[48,1910,1911],{"class":50,"line":186},[48,1912,172],{"emptyLinePlaceholder":171},[48,1914,1915,1918,1920],{"class":50,"line":224},[48,1916,1917],{"class":58},"master_df_ ",[48,1919,121],{"class":54},[48,1921,1922],{"class":58}," master_df[master_df.notnull()]\n",[48,1924,1925,1928,1930,1933,1936],{"class":50,"line":273},[48,1926,1927],{"class":58},"master_df_.date1 ",[48,1929,121],{"class":54},[48,1931,1932],{"class":58}," pd.to_datetime(master_df_[",[48,1934,1935],{"class":132},"'date1'",[48,1937,1938],{"class":58},"])\n",[48,1940,1941],{"class":50,"line":300},[48,1942,172],{"emptyLinePlaceholder":171},[48,1944,1945,1948,1950],{"class":50,"line":305},[48,1946,1947],{"class":58},"list_of_dates ",[48,1949,121],{"class":54},[48,1951,1952],{"class":58}," 
master_df_.date1.sort_values()\n",[48,1954,1955],{"class":50,"line":735},[48,1956,172],{"emptyLinePlaceholder":171},[48,1958,1959,1962,1964,1967,1970,1972,1975],{"class":50,"line":740},[48,1960,1961],{"class":58},"counts ",[48,1963,121],{"class":54},[48,1965,1966],{"class":58}," np.arange(",[48,1968,1969],{"class":144},"0",[48,1971,200],{"class":58},[48,1973,1974],{"class":320},"len",[48,1976,1977],{"class":58},"(list_of_dates))\n",[48,1979,1980,1983,1985],{"class":50,"line":746},[48,1981,1982],{"class":58},"_ ",[48,1984,121],{"class":54},[48,1986,1987],{"class":58}," plt.plot(list_of_dates, counts)\n",[48,1989,1990,1992,1994,1997,2000],{"class":50,"line":759},[48,1991,1982],{"class":58},[48,1993,121],{"class":54},[48,1995,1996],{"class":58}," plt.title(",[48,1998,1999],{"class":132},"'Number of subreddits over time'",[48,2001,136],{"class":58},[48,2003,2004,2006,2008,2011,2014],{"class":50,"line":764},[48,2005,1982],{"class":58},[48,2007,121],{"class":54},[48,2009,2010],{"class":58}," plt.xlabel(",[48,2012,2013],{"class":132},"'Date'",[48,2015,136],{"class":58},[48,2017,2018,2020,2022,2025,2028],{"class":50,"line":770},[48,2019,1982],{"class":58},[48,2021,121],{"class":54},[48,2023,2024],{"class":58}," plt.ylabel(",[48,2026,2027],{"class":132},"'Cummulative Count'",[48,2029,136],{"class":58},[48,2031,2032,2035,2038],{"class":50,"line":784},[48,2033,2034],{"class":58},"plt.savefig(os.path.expanduser(",[48,2036,2037],{"class":132},"'~/Documents/GitHub/briancaffey.github.io/static/subreddit_graph/subreddits_count.png'",[48,2039,669],{"class":58},[15,2041,2042],{},[2043,2044],"img",{"alt":2045,"src":2046},"png","/static/subreddit_graph/subreddits_count.png",[10,2048,2050],{"id":2049},"setting-up-a-graph-with-networkx","Setting up a graph with NetworkX",[15,2052,2053,2054,2059],{},"Next we can start to look at the collection of reddits and related subreddits as a graph. 
I will be using a Python package for network and graph analysis called ",[19,2055,2058],{"href":2056,"rel":2057},"https://networkx.github.io",[361],"NetworkX",".",[39,2061,2063],{"className":41,"code":2062,"language":43,"meta":44,"style":44},"#Let's make sure that we have only unique entries in the dataframe.\nmaster_df_u = master_df_.drop_duplicates('subreddit')\n",[30,2064,2065,2070],{"__ignoreMap":44},[48,2066,2067],{"class":50,"line":51},[48,2068,2069],{"class":657},"#Let's make sure that we have only unique entries in the dataframe.\n",[48,2071,2072,2075,2077,2080,2082],{"class":50,"line":68},[48,2073,2074],{"class":58},"master_df_u ",[48,2076,121],{"class":54},[48,2078,2079],{"class":58}," master_df_.drop_duplicates(",[48,2081,1149],{"class":132},[48,2083,136],{"class":58},[39,2085,2087],{"className":41,"code":2086,"language":43,"meta":44,"style":44},"master_df_u = master_df_u.drop(master_df_u.index[master_df_u.subreddit=='/r/track__subreddits_'])\n",[30,2088,2089],{"__ignoreMap":44},[48,2090,2091,2093,2095,2098,2101,2104],{"class":50,"line":51},[48,2092,2074],{"class":58},[48,2094,121],{"class":54},[48,2096,2097],{"class":58}," master_df_u.drop(master_df_u.index[master_df_u.subreddit",[48,2099,2100],{"class":54},"==",[48,2102,2103],{"class":132},"'/r/track__subreddits_'",[48,2105,1938],{"class":58},[39,2107,2109],{"className":41,"code":2108,"language":43,"meta":44,"style":44},"#here we define a dictionary where the keys are subreddits and the values are lists of related subreddits\ngraph = {x:y for x, y in zip(master_df_u.subreddit, master_df_u.related)}\n",[30,2110,2111,2116],{"__ignoreMap":44},[48,2112,2113],{"class":50,"line":51},[48,2114,2115],{"class":657},"#here we define a dictionary where the keys are subreddits and the values are lists of related subreddits\n",[48,2117,2118,2121,2123,2126,2128,2131,2133,2136],{"class":50,"line":68},[48,2119,2120],{"class":58},"graph ",[48,2122,121],{"class":54},[48,2124,2125],{"class":58}," {x:y 
",[48,2127,308],{"class":54},[48,2129,2130],{"class":58}," x, y ",[48,2132,314],{"class":54},[48,2134,2135],{"class":320}," zip",[48,2137,2138],{"class":58},"(master_df_u.subreddit, master_df_u.related)}\n",[39,2140,2142],{"className":41,"code":2141,"language":43,"meta":44,"style":44},"#NetworkX comes with the python Anaconda distribution\nimport networkx as nx\n",[30,2143,2144,2149],{"__ignoreMap":44},[48,2145,2146],{"class":50,"line":51},[48,2147,2148],{"class":657},"#NetworkX comes with the python Anaconda distribution\n",[48,2150,2151,2153,2156,2158],{"class":50,"line":68},[48,2152,62],{"class":54},[48,2154,2155],{"class":58}," networkx ",[48,2157,92],{"class":54},[48,2159,2160],{"class":58}," nx\n",[39,2162,2164],{"className":41,"code":2163,"language":43,"meta":44,"style":44},"G=nx.Graph()\nG=nx.from_dict_of_lists(graph)\n#making the graph undirected takes all of the vertices between nodes and makes them bi-directional\nG1 = G.to_undirected()\n",[30,2165,2166,2176,2185,2190],{"__ignoreMap":44},[48,2167,2168,2171,2173],{"class":50,"line":51},[48,2169,2170],{"class":58},"G",[48,2172,121],{"class":54},[48,2174,2175],{"class":58},"nx.Graph()\n",[48,2177,2178,2180,2182],{"class":50,"line":68},[48,2179,2170],{"class":58},[48,2181,121],{"class":54},[48,2183,2184],{"class":58},"nx.from_dict_of_lists(graph)\n",[48,2186,2187],{"class":50,"line":76},[48,2188,2189],{"class":657},"#making the graph undirected takes all of the vertices between nodes and makes them bi-directional\n",[48,2191,2192,2195,2197],{"class":50,"line":84},[48,2193,2194],{"class":58},"G1 ",[48,2196,121],{"class":54},[48,2198,2199],{"class":58}," G.to_undirected()\n",[39,2201,2203],{"className":41,"code":2202,"language":43,"meta":44,"style":44},"choice = np.random.choice(master_df_u.subreddit, 2)\nprint choice\n",[30,2204,2205,2219],{"__ignoreMap":44},[48,2206,2207,2210,2212,2215,2217],{"class":50,"line":51},[48,2208,2209],{"class":58},"choice ",[48,2211,121],{"class":54},[48,2213,2214],{"class":58}," 
np.random.choice(master_df_u.subreddit, ",[48,2216,752],{"class":144},[48,2218,136],{"class":58},[48,2220,2221,2223],{"class":50,"line":68},[48,2222,321],{"class":320},[48,2224,2225],{"class":58}," choice\n",[39,2227,2230],{"className":2228,"code":2229,"language":628},[626],"['/r/streetboarding' '/r/stephenking']\n",[30,2231,2229],{"__ignoreMap":44},[15,2233,2234],{},"Let's test out some of the functions from NetworkX for graph analysis. First, let's take the two randomly selected nodes defined above and test to see if there exists a path between them:",[39,2236,2238],{"className":41,"code":2237,"language":43,"meta":44,"style":44},"nx.has_path(G1, choice[0], choice[1])\n",[30,2239,2240],{"__ignoreMap":44},[48,2241,2242,2245,2247,2250,2253],{"class":50,"line":51},[48,2243,2244],{"class":58},"nx.has_path(G1, choice[",[48,2246,1969],{"class":144},[48,2248,2249],{"class":58},"], choice[",[48,2251,2252],{"class":144},"1",[48,2254,1938],{"class":58},[39,2256,2259],{"className":2257,"code":2258,"language":628},[626],"True\n",[30,2260,2258],{"__ignoreMap":44},[10,2262,2264],{"id":2263},"shortest-path","Shortest path",[15,2266,2267],{},"Now let's see (at least one of) the shortest path that exists between these nodes:",[39,2269,2271],{"className":41,"code":2270,"language":43,"meta":44,"style":44},"nx.shortest_path(G1, choice[0], choice[1])\n",[30,2272,2273],{"__ignoreMap":44},[48,2274,2275,2278,2280,2282,2284],{"class":50,"line":51},[48,2276,2277],{"class":58},"nx.shortest_path(G1, choice[",[48,2279,1969],{"class":144},[48,2281,2249],{"class":58},[48,2283,2252],{"class":144},[48,2285,1938],{"class":58},[39,2287,2290],{"className":2288,"code":2289,"language":628},[626],"['/r/streetboarding',\n '/r/freebord',\n '/r/adrenaline',\n '/r/imaginaryadrenaline',\n '/r/imaginarystephenking',\n '/r/stephenking']\n",[30,2291,2289],{"__ignoreMap":44},[15,2293,2294],{},"Let's write a function that selects two random subreddits and then prints a shortest path if it 
exists:",[39,2296,2298],{"className":41,"code":2297,"language":43,"meta":44,"style":44},"def short_path():\n    choices = np.random.choice(master_df_u.subreddit, 2)\n    if nx.has_path(G1, choices[0], choices[1]) == True:\n        path = nx.shortest_path(G1, choices[0], choices[1])\n        print choices[0] + ' and ' + choices[1] + ' are joined by: \\n' + str(path)\n    else:\n        print \"No path exists between \" + choices[0] + ' and ' + choices[1]\n",[30,2299,2300,2312,2325,2349,2367,2409,2415],{"__ignoreMap":44},[48,2301,2302,2305,2309],{"class":50,"line":51},[48,2303,2304],{"class":235},"def",[48,2306,2308],{"class":2307},"srTi1"," short_path",[48,2310,2311],{"class":58},"():\n",[48,2313,2314,2317,2319,2321,2323],{"class":50,"line":68},[48,2315,2316],{"class":58},"    choices ",[48,2318,121],{"class":54},[48,2320,2214],{"class":58},[48,2322,752],{"class":144},[48,2324,136],{"class":58},[48,2326,2327,2329,2332,2334,2337,2339,2342,2344,2347],{"class":50,"line":76},[48,2328,1201],{"class":54},[48,2330,2331],{"class":58}," nx.has_path(G1, choices[",[48,2333,1969],{"class":144},[48,2335,2336],{"class":58},"], choices[",[48,2338,2252],{"class":144},[48,2340,2341],{"class":58},"]) ",[48,2343,2100],{"class":54},[48,2345,2346],{"class":144}," True",[48,2348,1297],{"class":58},[48,2350,2351,2354,2356,2359,2361,2363,2365],{"class":50,"line":84},[48,2352,2353],{"class":58},"        path ",[48,2355,121],{"class":54},[48,2357,2358],{"class":58}," nx.shortest_path(G1, choices[",[48,2360,1969],{"class":144},[48,2362,2336],{"class":58},[48,2364,2252],{"class":144},[48,2366,1938],{"class":58},[48,2368,2369,2372,2375,2377,2380,2382,2385,2387,2389,2391,2393,2395,2398,2400,2402,2404,2406],{"class":50,"line":98},[48,2370,2371],{"class":320},"        print",[48,2373,2374],{"class":58}," choices[",[48,2376,1969],{"class":144},[48,2378,2379],{"class":58},"] ",[48,2381,261],{"class":54},[48,2383,2384],{"class":132}," ' and 
'",[48,2386,148],{"class":54},[48,2388,2374],{"class":58},[48,2390,2252],{"class":144},[48,2392,2379],{"class":58},[48,2394,261],{"class":54},[48,2396,2397],{"class":132}," ' are joined by: ",[48,2399,1415],{"class":144},[48,2401,1412],{"class":132},[48,2403,148],{"class":54},[48,2405,905],{"class":281},[48,2407,2408],{"class":58},"(path)\n",[48,2410,2411,2413],{"class":50,"line":175},[48,2412,1294],{"class":54},[48,2414,1297],{"class":58},[48,2416,2417,2419,2422,2424,2426,2428,2430,2432,2434,2436,2438,2440],{"class":50,"line":186},[48,2418,2371],{"class":320},[48,2420,2421],{"class":132}," \"No path exists between \"",[48,2423,148],{"class":54},[48,2425,2374],{"class":58},[48,2427,1969],{"class":144},[48,2429,2379],{"class":58},[48,2431,261],{"class":54},[48,2433,2384],{"class":132},[48,2435,148],{"class":54},[48,2437,2374],{"class":58},[48,2439,2252],{"class":144},[48,2441,814],{"class":58},[15,2443,2444,2445,2448],{},"Here's a collection of results from the ",[30,2446,2447],{},"short_path"," function defined above that start to paint a picture of the broad set of topics covered by reddit.com:",[39,2450,2452],{"className":41,"code":2451,"language":43,"meta":44,"style":44},"short_path()\n",[30,2453,2454],{"__ignoreMap":44},[48,2455,2456],{"class":50,"line":51},[48,2457,2451],{"class":58},[39,2459,2462],{"className":2460,"code":2461,"language":628},[626],"/r/personalizationadvice and /r/beautifulfemales are joined by:\n['/r/personalizationadvice', '/r/coloranalysis', '/r/fashion', '/r/redcarpet', '/r/gentlemanboners', '/r/beautifulfemales']\n",[30,2463,2461],{"__ignoreMap":44},[39,2465,2466],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2467,2468],{"__ignoreMap":44},[48,2469,2470],{"class":50,"line":51},[48,2471,2451],{"class":58},[39,2473,2476],{"className":2474,"code":2475,"language":628},[626],"/r/caffeine and /r/shittyramen are joined by:\n['/r/caffeine', '/r/toast', '/r/cooking', '/r/ramen', 
'/r/shittyramen']\n",[30,2477,2475],{"__ignoreMap":44},[39,2479,2480],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2481,2482],{"__ignoreMap":44},[48,2483,2484],{"class":50,"line":51},[48,2485,2451],{"class":58},[39,2487,2490],{"className":2488,"code":2489,"language":628},[626],"/r/watchingcongress and /r/iwantthatonashirt are joined by:\n['/r/watchingcongress', '/r/stand', '/r/snowden', '/r/undelete', '/r/trees', '/r/iwantthatonashirt']\n",[30,2491,2489],{"__ignoreMap":44},[39,2493,2494],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2495,2496],{"__ignoreMap":44},[48,2497,2498],{"class":50,"line":51},[48,2499,2451],{"class":58},[39,2501,2504],{"className":2502,"code":2503,"language":628},[626],"/r/asksciencediscussion and /r/dogsonhardwoodfloors are joined by:\n['/r/asksciencediscussion', '/r/badscience', '/r/badlinguistics', '/r/animalsbeingjerks', '/r/startledcats', '/r/dogsonhardwoodfloors']\n",[30,2505,2503],{"__ignoreMap":44},[39,2507,2508],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2509,2510],{"__ignoreMap":44},[48,2511,2512],{"class":50,"line":51},[48,2513,2451],{"class":58},[39,2515,2518],{"className":2516,"code":2517,"language":628},[626],"/r/randommail and /r/mini are joined by:\n['/r/randommail', '/r/spiceexchange', '/r/cameraswapping', '/r/itookapicture', '/r/carporn', '/r/mini']\n",[30,2519,2517],{"__ignoreMap":44},[39,2521,2522],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2523,2524],{"__ignoreMap":44},[48,2525,2526],{"class":50,"line":51},[48,2527,2451],{"class":58},[39,2529,2532],{"className":2530,"code":2531,"language":628},[626],"/r/catsinsinks and /r/nzmovies are joined by:\n['/r/catsinsinks', '/r/wetcats', '/r/tinysubredditoftheday', '/r/sheep', '/r/nzmetahub', 
'/r/nzmovies']\n",[30,2533,2531],{"__ignoreMap":44},[39,2535,2536],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2537,2538],{"__ignoreMap":44},[48,2539,2540],{"class":50,"line":51},[48,2541,2451],{"class":58},[39,2543,2546],{"className":2544,"code":2545,"language":628},[626],"/r/thoriumreactor and /r/sailing are joined by:\n['/r/thoriumreactor', '/r/energy', '/r/spev', '/r/sailing']\n",[30,2547,2545],{"__ignoreMap":44},[39,2549,2550],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2551,2552],{"__ignoreMap":44},[48,2553,2554],{"class":50,"line":51},[48,2555,2451],{"class":58},[39,2557,2560],{"className":2558,"code":2559,"language":628},[626],"/r/deathnote and /r/vegetarianism are joined by:\n['/r/deathnote', '/r/television', '/r/netflixbestof', '/r/naturefilms', '/r/environment', '/r/vegetarianism']\n",[30,2561,2559],{"__ignoreMap":44},[39,2563,2564],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2565,2566],{"__ignoreMap":44},[48,2567,2568],{"class":50,"line":51},[48,2569,2451],{"class":58},[39,2571,2574],{"className":2572,"code":2573,"language":628},[626],"/r/mississippir4r and /r/mathematics are joined by:\n['/r/mississippir4r', '/r/mississippi', '/r/prisonreform', '/r/socialscience', '/r/alltech', '/r/mathematics']\n",[30,2575,2573],{"__ignoreMap":44},[39,2577,2578],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2579,2580],{"__ignoreMap":44},[48,2581,2582],{"class":50,"line":51},[48,2583,2451],{"class":58},[39,2585,2588],{"className":2586,"code":2587,"language":628},[626],"/r/britainsgottalent and /r/irelandbaldwin are joined by:\n['/r/britainsgottalent', '/r/britishtv', '/r/that70sshow', '/r/mila_kunis', '/r/christinaricci', 
'/r/irelandbaldwin']\n",[30,2589,2587],{"__ignoreMap":44},[39,2591,2592],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2593,2594],{"__ignoreMap":44},[48,2595,2596],{"class":50,"line":51},[48,2597,2451],{"class":58},[39,2599,2602],{"className":2600,"code":2601,"language":628},[626],"/r/the_donald and /r/ladybusiness are joined by:\n['/r/the_donald', '/r/shitliberalssay', '/r/trollxchromosomes', '/r/ladybusiness']\n",[30,2603,2601],{"__ignoreMap":44},[39,2605,2606],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2607,2608],{"__ignoreMap":44},[48,2609,2610],{"class":50,"line":51},[48,2611,2451],{"class":58},[39,2613,2616],{"className":2614,"code":2615,"language":628},[626],"/r/selfharm and /r/medlabprofessionals are joined by:\n['/r/selfharm', '/r/adhd', '/r/neuroimaging', '/r/pharmacy', '/r/medlabprofessionals']\n",[30,2617,2615],{"__ignoreMap":44},[39,2619,2620],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2621,2622],{"__ignoreMap":44},[48,2623,2624],{"class":50,"line":51},[48,2625,2451],{"class":58},[39,2627,2630],{"className":2628,"code":2629,"language":628},[626],"/r/coverart and /r/phillycraftbeer are joined by:\n['/r/coverart', '/r/nostalgia', '/r/upvotedbecausegirl', '/r/wtf', '/r/remindsmeofdf', '/r/beer', '/r/phillycraftbeer']\n",[30,2631,2629],{"__ignoreMap":44},[39,2633,2634],{"className":41,"code":2451,"language":43,"meta":44,"style":44},[30,2635,2636],{"__ignoreMap":44},[48,2637,2638],{"class":50,"line":51},[48,2639,2451],{"class":58},[39,2641,2644],{"className":2642,"code":2643,"language":628},[626],"/r/hotguyswithlonghair and /r/castles are joined by:\n['/r/hotguyswithlonghair', '/r/majesticmanes', '/r/ladyboners', '/r/imaginaryladyboners', '/r/imaginarycastles', '/r/castles']\n",[30,2645,2643],{"__ignoreMap":44},[15,2647,2648,2649,2654,2655,2660],{},"Taking a look 
",[19,2650,2653],{"href":2651,"rel":2652},"http://networkx.readthedocs.io/en/networkx-1.11/_modules/networkx/algorithms/shortest_paths/unweighted.html?highlight=bidirectional_shortest_path",[361],"under the hood"," of NetworkX and examining the algorithm that finds the ",[19,2656,2659],{"href":2657,"rel":2658},"http://networkx.readthedocs.io/en/networkx-1.11/_modules/networkx/algorithms/shortest_paths/generic.html#shortest_path",[361],"shortest path"," between any two nodes in a graph, we find that it simply boils down to:",[39,2662,2665],{"className":2663,"code":2664,"language":628},[626],"def shortest_path(G, source=None, target=None, weight=None):\n    paths=nx.bidirectional_shortest_path(G,source,target)\n    return paths\n",[30,2666,2664],{"__ignoreMap":44},[15,2668,2669,2670,2673,2674,2678],{},"You can read more about the ",[30,2671,2672],{},"bidirectional_shortest_path"," function ",[19,2675,2677],{"href":2651,"rel":2676},[361],"here"," in the NetworkX documentation.",[15,2680,2681,2682,2685],{},"When I was first experimenting with graph algorithms, I had an interesting result using an algorithm introduced ",[19,2683,2677],{"href":615,"rel":2684},[361]," in the Python documentation. 
Here's the algorithm:",[39,2687,2689],{"className":41,"code":2688,"language":43,"meta":44,"style":44},"def find_path(graph, start, end, path=[]):\n    path = path + [start]\n    if start == end:\n        return path\n    if not graph.has_key(start):\n        return None\n    for node in graph[start]:\n        if node not in path:\n            newpath = find_path(graph, node, end, path)\n            if newpath: return newpath\n    return None\n",[30,2690,2691,2724,2739,2751,2759,2769,2776,2789,2805,2815,2829],{"__ignoreMap":44},[48,2692,2693,2695,2698,2700,2704,2706,2709,2711,2714,2716,2719,2721],{"class":50,"line":51},[48,2694,2304],{"class":235},[48,2696,2697],{"class":2307}," find_path",[48,2699,285],{"class":58},[48,2701,2703],{"class":2702},"so59x","graph",[48,2705,200],{"class":58},[48,2707,2708],{"class":2702},"start",[48,2710,200],{"class":58},[48,2712,2713],{"class":2702},"end",[48,2715,200],{"class":58},[48,2717,2718],{"class":2702},"path",[48,2720,121],{"class":54},[48,2722,2723],{"class":58},"[]):\n",[48,2725,2726,2729,2731,2734,2736],{"class":50,"line":68},[48,2727,2728],{"class":58},"    path ",[48,2730,121],{"class":54},[48,2732,2733],{"class":58}," path ",[48,2735,261],{"class":54},[48,2737,2738],{"class":58}," [start]\n",[48,2740,2741,2743,2746,2748],{"class":50,"line":76},[48,2742,1201],{"class":54},[48,2744,2745],{"class":58}," start ",[48,2747,2100],{"class":54},[48,2749,2750],{"class":58}," end:\n",[48,2752,2753,2756],{"class":50,"line":84},[48,2754,2755],{"class":54},"        return",[48,2757,2758],{"class":58}," path\n",[48,2760,2761,2763,2766],{"class":50,"line":98},[48,2762,1201],{"class":54},[48,2764,2765],{"class":54}," not",[48,2767,2768],{"class":58}," graph.has_key(start):\n",[48,2770,2771,2773],{"class":50,"line":175},[48,2772,2755],{"class":54},[48,2774,2775],{"class":144}," None\n",[48,2777,2778,2781,2784,2786],{"class":50,"line":186},[48,2779,2780],{"class":54},"    for",[48,2782,2783],{"class":58}," node 
",[48,2785,314],{"class":54},[48,2787,2788],{"class":58}," graph[start]:\n",[48,2790,2791,2794,2796,2799,2802],{"class":50,"line":224},[48,2792,2793],{"class":54},"        if",[48,2795,2783],{"class":58},[48,2797,2798],{"class":54},"not",[48,2800,2801],{"class":54}," in",[48,2803,2804],{"class":58}," path:\n",[48,2806,2807,2810,2812],{"class":50,"line":273},[48,2808,2809],{"class":58},"            newpath ",[48,2811,121],{"class":54},[48,2813,2814],{"class":58}," find_path(graph, node, end, path)\n",[48,2816,2817,2820,2823,2826],{"class":50,"line":300},[48,2818,2819],{"class":54},"            if",[48,2821,2822],{"class":58}," newpath: ",[48,2824,2825],{"class":54},"return",[48,2827,2828],{"class":58}," newpath\n",[48,2830,2831,2834],{"class":50,"line":305},[48,2832,2833],{"class":54},"    return",[48,2835,2775],{"class":144},[15,2837,2838],{},"The above algorithm uses a process called backtracking to exhaustively try all possibilities until it returns a solution. It creates an interesting \"random walk\" through groups of related subreddits. 
Here's the result of calling the above function on our graph (only 2 layers deep) with two random nodes: /r/persianrap and /r/nosleep:",[353,2840,2841],{},[15,2842,2843],{},"/r/persianrap /r/middleeasternmusic /r/arabic /r/arabs /r/libyancrisis /r/syriancivilwar /r/yemenicrisis /r/sinaiinsurgency /r/jihadinfocus /r/credibledefense /r/geopolitics /r/forgottennews /r/libyanconflict /r/menaconflicts /r/iran /r/iranianlgbt /r/zoroastrianism /r/kurdistan /r/rojava /r/anarchism /r/imaginarypolitics /r/imaginaryimmortals /r/imaginaryclerics /r/imaginarylakes /r/imaginaryaliens /r/imaginarygnomes /r/imaginaryladyboners /r/imaginaryturtleworlds /r/imaginarysunnydale /r/imaginarydwarves /r/imaginarywizards /r/imaginaryvikings /r/imaginarycolorscapes /r/imaginarysteampunk /r/imaginarytemples /r/imaginaryblueprints /r/comicbookart /r/imaginarytechnology /r/mtgporn /r/imaginaryoldkingdom /r/imaginaryfactories /r/imaginaryfederation /r/imaginarylovers /r/imaginarynarnia /r/imaginarydwellings /r/imaginaryscience /r/imaginarytaverns /r/imaginarybattlefields /r/cityporn /r/japanpics /r/nationalphotosubs /r/austriapics /r/southkoreapics /r/taiwanpics /r/ghanapics /r/kenyapics /r/norwaypics /r/vzlapics /r/perupics /r/antarcticapics /r/greatlakespics /r/lakeporn /r/pornoverlords /r/thingscutinhalfporn /r/manufacturing /r/cnc /r/askengineers /r/sciencesubreddits /r/math /r/simulate /r/cosmology /r/reddittothefuture /r/scifi /r/lost /r/the100books /r/the100 /r/theblacklist /r/nbc /r/dundermifflin /r/sonsofanarchy /r/twentyfour /r/banshee /r/hbo /r/siliconvalleyhbo /r/siliconvalley /r/california /r/tahoe /r/skiing /r/snowshoeing /r/xcountryskiing /r/wintergear /r/skijumping /r/winter /r/bigmountain /r/mountaineering /r/campingandhiking /r/earthporn /r/nature /r/birding /r/invasivespecies /r/zoology /r/entomology /r/rainforest /r/botany /r/wildlife /r/allscience /r/earthscience /r/energy /r/biomass /r/renewablenews /r/syngas /r/climatenews /r/composting /r/vermiculture /r/organicfarming 
/r/livestock /r/animalwelfare /r/randomactsofpetfood /r/animalreddits /r/cockatiel /r/catpics /r/tortoises /r/whales /r/cetacea /r/lifeaquatic /r/hrw /r/green_peace /r/environmental_policy /r/conservation /r/depthhub /r/indepthsports /r/deeperhubbeta /r/lectures /r/spacepolicy /r/skylon /r/ula /r/isro /r/engineteststands /r/jupiters /r/imaginarystarscapes /r/spacequestions /r/spaceflight /r/moon /r/dione /r/europa /r/oortcloud /r/dwarfplanetceres /r/saturn /r/asteroidbelt /r/mars /r/rhea /r/venus /r/astrophys /r/spacevideos /r/transhuman /r/timereddits /r/virtualreality /r/vive /r/oculus /r/learnvrdev /r/unity3d /r/gamedev /r/crowdfunding /r/crowdsourcing /r/mturk /r/swagbucks /r/beermoney /r/flipping /r/shoplifting /r/thriftstorehauls /r/dvdcollection /r/televisionposterporn /r/concertposterporn /r/movieposterporn /r/lv426 /r/predator /r/arnoldschwarzenegger /r/alanpartridge /r/americandad /r/timanderic /r/homemovies /r/gravityfalls /r/homestarrunner /r/telltale /r/thewalkingdeadgame /r/thewalkingdeadgifs /r/twdnomansland /r/heycarl /r/twdroadtosurvival /r/thewalkingdead /r/zombies /r/guns /r/swissguns /r/opencarry /r/libertarian /r/geolibertarianism /r/basicincome /r/basicincomeactivism /r/mhoc /r/modelaustralia /r/rmtk /r/thenetherlands /r/tokkiefeesboek /r/nujijinactie /r/ik_ihe /r/youirl /r/fite_me_irl /r/2meirl4meirl /r/depression /r/randomactsofcards /r/philately /r/coins /r/coins4sale /r/ancientcoins /r/ancientrome /r/flatblue /r/bestofwritingprompts /r/writingprompts /r/promptoftheday /r/flashfiction /r/keepwriting /r/getmotivated /r/mentors /r/favors /r/recordthis /r/videography /r/animation /r/3dsmax /r/computergraphics /r/cinema4d /r/design /r/ui_design /r/designjobs /r/heavymind /r/wtfart /r/alternativeart /r/imaginaryninjas /r/imaginaryruins /r/isometric /r/imaginaryislands /r/imaginaryverse /r/icandrawthat /r/caricatures /r/imaginaryneweden /r/imaginaryequestria /r/imaginaryaww /r/imaginarycyberpunk /r/chinafuturism /r/scifirealism 
/r/inegentlemanboners /r/imaginarywtf /r/imaginaryelementals /r/imaginarydinosaurs /r/dinosaurs /r/speculativeevolution /r/hybridanimals /r/photoshopbattles /r/cutouts /r/battleshops /r/graphic_design /r/visualization /r/statistics /r/oncourtanalytics /r/nbaanalytics /r/nba /r/pacers /r/atlantahawks /r/basketball /r/mavericks /r/fcdallas /r/theticket /r/dallasstars /r/bostonbruins /r/patriots /r/tennesseetitans /r/nashvillesounds /r/predators /r/flyers /r/hockeyfandom /r/caps /r/nhl /r/detroitredwings /r/sabres /r/floridapanthers /r/habs /r/montrealimpact /r/alouettes /r/cfl /r/stadiumporn /r/nfl /r/madden /r/eurobowl /r/fantasyfb /r/fantasyfootball /r/49ers /r/footballgamefilm /r/footballstrategy /r/cfb /r/collegebaseball /r/mlbdraft /r/baseball /r/cubs /r/cardinals /r/saintlouisfc /r/stlouisblues /r/stlouis /r/stlouisbiking /r/mobicycling /r/bicycling /r/vintage_bicycles /r/miamibiking /r/fatbike /r/cycling /r/strava /r/phillycycling /r/wheelbuild /r/bikewrench /r/velo /r/bikepolo /r/bicycletouring /r/bicyclingcirclejerk /r/bikecommuting /r/ukbike /r/leedscycling /r/londoncycling /r/fixedgearbicycle /r/cyclingfashion /r/peloton /r/mtb /r/climbingporn /r/adrenaline /r/motocross /r/bmxracing /r/wake /r/snowboardingnoobs /r/freebord /r/snowboarding /r/sledding /r/outdoors /r/soposts /r/cordcutters /r/netflixviavpn /r/hulu /r/firetv /r/netflixbestof /r/raisinghope /r/madmen /r/earthsgottalent /r/bobsburgers /r/fringe /r/louie /r/theoriginals /r/iansomerhalder /r/kat_graham /r/indianaevans /r/janelevy /r/gagegolightly /r/sarahhyland /r/starlets /r/ninadobrev /r/kathrynnewton /r/arielwinter /r/ashleygreene /r/gentlemanboners /r/bandporn /r/musicpics /r/listentomusic /r/listentonew /r/subraddits /r/dtipics /r/damnthatsinteresting /r/interestingasfuck /r/unexpected /r/wtf /r/weird /r/animalsbeingderps /r/animalsbeingconfused /r/humansbeingbros /r/hulpdiensten /r/askle /r/protectandserve /r/good_cop_free_donut /r/bad_cop_follow_up /r/amifreetogo /r/copwatch /r/puppycide 
/r/underreportednews /r/mediaquotes /r/savedyouaclick /r/news /r/neutralnews /r/ask_politics /r/politicalopinions /r/gunsarecool /r/renewableenergy /r/web_design /r/somebodymakethis /r/somethingimade /r/crafts /r/kidscrafts /r/daddit /r/formulafeeders /r/boobsandbottles /r/csectioncentral /r/predaddit /r/dadbloggers /r/mombloggers /r/cutekids /r/bigfeats /r/scienceparents /r/lv9hrvv /r/sahp /r/tryingforababy /r/waiting_to_try /r/pcos /r/infertility /r/birthparents /r/tfabchartstalkers /r/firsttimettc /r/cautiousbtb /r/ttchealthy /r/xxketo /r/ketoscience /r/ketogains /r/leangains /r/gettingshredded /r/bulkorcut /r/gainit /r/decidingtobebetter /r/zen /r/buddhism /r/astralprojection /r/spirituality /r/hinduism /r/yoga /r/veganfitness /r/posture /r/health /r/ukhealthcare /r/pharmacy /r/nursing /r/doctorswithoutborders /r/humanitarian /r/assistance /r/paranormalhelp /r/paranormal /r/333 /r/askparanormal /r/intelligence /r/blackhat /r/netsec /r/technology /r/newyorkfuturistparty /r/rad_decentralization /r/massachusettsfp /r/opensource /r/alabamafp /r/darknetplan /r/torrents /r/i2p /r/privacy /r/badgovnofreedom /r/censorship /r/governmentoppression /r/descentintotyranny /r/wikileaks /r/dncleaks /r/hillaryforprison /r/the_donald /r/shitredditsays /r/srsmythos /r/srstrees /r/entwives /r/lesbients /r/actuallesbians /r/lesbianromance /r/lesbianerotica /r/l4l /r/dyke /r/ladyladyboners /r/bisexual /r/bisexy /r/biwomen /r/pansexual /r/genderqueer /r/transspace /r/lgbtlibrary /r/lgbtnews /r/dixiequeer /r/lgbt /r/sex /r/helpmecope /r/bpd /r/rapecounseling /r/trueoffmychest /r/suicidewatch /r/bipolarsos /r/bipolar /r/mentalpod /r/adhd /r/hoarding /r/declutter /r/thrifty /r/tinyhouses /r/leanfire /r/lowcar /r/zerowaste /r/simpleliving /r/livingofftheland /r/hunting /r/animaltracking /r/survival /r/vedc /r/4x4 /r/classiccars /r/automotivetraining /r/autodetailing /r/cartalk /r/mercedes_benz /r/motorsports /r/rallycross /r/worldrallycross /r/blancpain /r/nascarhometracks /r/arcaracing 
/r/stadiumsupertrucks /r/hydroplanes /r/sailing /r/boatbuilding /r/woodworking /r/cottage_industry /r/farriers /r/blacksmith /r/bladesmith /r/knives /r/swissarmyknives /r/switzerland /r/bern /r/sanktgallen /r/liechtenstein /r/erasmus /r/de /r/germanpuns /r/schland /r/rvacka /r/sloensko /r/slovakia /r/belarus /r/andorra /r/europe /r/hungary /r/francophonie /r/thailand /r/vietnam /r/vietnampics /r/travel /r/geography /r/climate /r/drought /r/waterutilities /r/drylands /r/irrigation /r/water /r/onthewaterfront /r/wetlands /r/marinelife /r/ocean /r/seasteading /r/frontier_colonization /r/arcology /r/retrofuturism /r/goldenpath /r/politics /r/moderationtheory /r/wdp /r/outoftheloop /r/wherearetheynow /r/entertainment /r/portlandia /r/themichaeljfoxshow /r/backtothefuture /r/bladerunner /r/filmnoir /r/vintageladyboners /r/classicfilms /r/foreignmovies /r/britishfilms /r/canadianfilm /r/newjerseyfilm /r/newzealandfilm /r/newzealand /r/wellington /r/nzmetahub /r/newzealandhistory /r/scottishhistory /r/scots /r/scottishproblems /r/britishproblems /r/swedishproblems /r/pinsamt /r/sweden /r/svenskpolitik /r/arbetarrorelsen /r/socialism /r/shittydebatecommunism /r/shittysocialscience /r/shittyideasforadmins /r/shittytheoryofreddit /r/shittybuildingporn /r/shittylifeprotips /r/shittyshitredditsays /r/shittyquotesporn /r/shittyama /r/askashittyparent /r/shittyprogramming /r/shittyaskalawyer /r/badlegaladvice /r/badscience /r/badeconomics /r/badhistory /r/historicalrage /r/metarage /r/ragenovels /r/fffffffuuuuuuuuuuuu /r/gaaaaaaayyyyyyyyyyyy /r/lgbteens /r/needafriend /r/rant /r/showerthoughts /r/markmywords /r/calledit /r/futurewhatif /r/sportswhatif /r/alternatehistory /r/maps /r/xkcd /r/kerbalspaceprogram /r/spacesimgames /r/eve /r/scifigaming /r/masseffect /r/imaginarymasseffect /r/imaginaryvampires /r/imaginarytowers /r/imaginarybestof /r/pics /r/spaceporn /r/auroraporn /r/weatherporn /r/sfwpornnetwork /r/fwepp /r/shittyearthporn /r/shittyaskreddit /r/askashittyphilosopher 
/r/shittyaskhistory /r/shittysuboftheweek /r/shittyaskcooking /r/shittyhub /r/coolguides /r/trendingsubreddits /r/monkslookingatbeer /r/beerporn /r/beerwithaview /r/shittybeerwithaview /r/shittyfoodporn /r/enttreats /r/trees /r/eldertrees /r/vaporents /r/crainn /r/eirhub /r/fairepublicofireland /r/gaeltacht /r/westmeath /r/tipperary /r/limerick /r/kilkenny /r/ireland /r/irejobs /r/resumes /r/careerguidance /r/flatone /r/centralillinois /r/chicubs /r/whitesox /r/minnesotatwins /r/minnesotavikings /r/greenbaypackers /r/jaguars /r/miamidolphins /r/nflroundtable /r/detroitlions /r/forhonor /r/vikingstv /r/hannibaltv /r/thepathhulu /r/batesmotel /r/hannibal /r/hitchcock /r/silentmoviegifs /r/moviestunts /r/bollywoodrealism /r/indiamain /r/indianews /r/asia /r/oldindia /r/explorepakistan /r/churchporn /r/medievalporn /r/castles /r/historyporn /r/thewaywewere /r/1970s /r/classicmovietrailers /r/warmovies /r/moviecritic /r/trailers /r/liveaction /r/animedeals /r/dbz /r/toonami /r/regularshow /r/thelifeandtimesoftim /r/aquajail /r/modern_family /r/supernatural /r/mishacollins /r/jaredpadalecki /r/fandomnatural /r/fangirls /r/trollxgirlgamers /r/trollmedia /r/trollgaming /r/trollmua /r/justtrollxthings /r/trollxmoms /r/trollmeta /r/trollychromosome /r/oney /r/askwomen /r/okcupid /r/relationship_advice /r/help /r/bugs /r/redditdev /r/enhancement /r/yoursub /r/horrorreviewed /r/truecreepy /r/metatruereddit /r/truepolitics /r/truehub /r/truegaming /r/askgames /r/freegamesonandroid /r/androidapps /r/apphookup /r/browsemyreddit /r/findareddit /r/trap /r/naut /r/militaryfinance /r/army /r/militarystories /r/nationalguard /r/uscg /r/usa /r/murica /r/lonestar /r/whataburger /r/fastfood /r/cocacola /r/kelloggs /r/kellawwggs /r/awwducational /r/marinebiologygifs /r/biologygifs /r/chemicalreactiongifs /r/homechemistry /r/holdmybeaker /r/holdmybeer /r/movieoftheday /r/sharknado /r/syfy /r/killjoys /r/theexpanse /r/truedetective /r/boardwalkempire /r/mobcast /r/1920s /r/1960s /r/beatles 
/r/minimaluminiumalism /r/ghostsrights /r/botsrights /r/totallynotrobots /r/robotics /r/manna /r/singularity /r/futureporn /r/singularitarianism /r/automate /r/darkfuturology /r/controlproblem /r/aiethics /r/ainothuman /r/neuraljokes /r/3amjokes /r/mommajokes /r/antijokes /r/absolutelynotme_irl /r/toomeirlformeirl /r/meirl /r/tree_irl /r/fishpost /r/mod_irl /r/pics_irl /r/teleshits /r/bitstrips /r/stopbullyingcomics /r/animalsbeingjerks /r/surfinganimals /r/unorthocat /r/catsubs /r/stuffoncats /r/catsinbusinessattire /r/catsinsinks /r/catsonkeyboards /r/mechanicalkeyboards /r/hackedgadgets /r/techsupportmacgyver /r/techsupport /r/programming /r/algorithms /r/datamining /r/datasets /r/wordcloud /r/datavizrequests /r/funnycharts /r/mapporn /r/mapmaking /r/worldbuilding /r/scificoncepts /r/apocalypseporn /r/imaginaryjerk /r/braveryjerk /r/circlejerk /r/politicaldiscussion /r/politicalfactchecking /r/moderatepolitics /r/truereddit /r/malelifestyle /r/fitness /r/swimming /r/freediving /r/bikeshop /r/climbing /r/climbharder /r/bouldering /r/climbergirls /r/womenshredders /r/skatergirls /r/girlsurfers /r/kiteboarding /r/longboarding /r/streetboarding /r/letsgosnowboarding /r/spliddit /r/backcountry /r/wjdbbl2 /r/caving /r/nationalparks /r/parkrangers /r/thesca /r/searchandrescue /r/wildernessbackpacking /r/campinggear /r/flashlight /r/camping /r/yellowstone /r/wmnf /r/pacificcresttrail /r/cdt /r/ultralight /r/backpacking /r/travelpartners /r/adventures /r/libraryofshadows /r/shortscarystories /r/shortscarystoriesooc /r/nosleepooc /r/nosleep",[10,2845,2847],{"id":2846},"centrality","Centrality",[15,2849,2850,2851,215],{},"Centrality is another important topic in graph theory. 
Here's a brief introduction to centrality from ",[19,2852,2855],{"href":2853,"rel":2854},"https://en.wikipedia.org/wiki/Centrality",[361],"Wikipedia",[353,2857,2858],{},[15,2859,2860],{},"In graph theory and network analysis, indicators of centrality identify the most important vertices within a graph. Applications include identifying the most influential person(s) in a social network, key infrastructure nodes in the Internet or urban networks, and super-spreaders of disease.",[15,2862,2863,2864,2867],{},"There are several different methods of measuring centrality in a graph. Here I use ",[30,2865,2866],{},"eigenvector_centrality_numpy",", a function included in NetworkX. It takes in a graph and returns a dictionary with graph nodes as keys and node centrality as values.",[39,2869,2871],{"className":41,"code":2870,"language":43,"meta":44,"style":44},"centrality = nx.eigenvector_centrality_numpy(G1)\n",[30,2872,2873],{"__ignoreMap":44},[48,2874,2875,2878,2880],{"class":50,"line":51},[48,2876,2877],{"class":58},"centrality ",[48,2879,121],{"class":54},[48,2881,2882],{"class":58}," nx.eigenvector_centrality_numpy(G1)\n",[15,2884,2885],{},"Let's see which subreddit has the highest centrality:",[39,2887,2889],{"className":41,"code":2888,"language":43,"meta":44,"style":44},"print max(centrality, key=centrality.get), centrality[max(centrality, key=centrality.get)]\n",[30,2890,2891],{"__ignoreMap":44},[48,2892,2893,2895,2898,2901,2904,2906,2909,2912,2914,2916,2918],{"class":50,"line":51},[48,2894,321],{"class":320},[48,2896,2897],{"class":320}," max",[48,2899,2900],{"class":58},"(centrality, ",[48,2902,2903],{"class":203},"key",[48,2905,121],{"class":54},[48,2907,2908],{"class":58},"centrality.get), centrality[",[48,2910,2911],{"class":320},"max",[48,2913,2900],{"class":58},[48,2915,2903],{"class":203},[48,2917,121],{"class":54},[48,2919,2920],{"class":58},"centrality.get)]\n",[39,2922,2925],{"className":2923,"code":2924,"language":628},[626],"/r/imaginarybattlefields 
0.0721530261127\n",[30,2926,2924],{"__ignoreMap":44},[39,2928,2930],{"className":41,"code":2929,"language":43,"meta":44,"style":44},"len(centrality) == len(sorted(centrality.values(), reverse=True))\n",[30,2931,2932],{"__ignoreMap":44},[48,2933,2934,2936,2939,2941,2944,2946,2949,2952,2955,2957,2960],{"class":50,"line":51},[48,2935,1974],{"class":320},[48,2937,2938],{"class":58},"(centrality) ",[48,2940,2100],{"class":54},[48,2942,2943],{"class":320}," len",[48,2945,285],{"class":58},[48,2947,2948],{"class":320},"sorted",[48,2950,2951],{"class":58},"(centrality.values(), ",[48,2953,2954],{"class":203},"reverse",[48,2956,121],{"class":54},[48,2958,2959],{"class":144},"True",[48,2961,669],{"class":58},[39,2963,2965],{"className":2964,"code":2258,"language":628},[626],[30,2966,2258],{"__ignoreMap":44},[15,2968,2969],{},"Since all of the centrality values are unique, we can look up nodes by their centrality values.",[39,2971,2973],{"className":41,"code":2972,"language":43,"meta":44,"style":44},"subr_list = []\nfor node in centrality:\n    subr_list.append((node, centrality[node]))\n\nsorted_subr_list = subr_list.sort(key=lambda x: x[1])\n",[30,2974,2975,2984,2995,3000,3004],{"__ignoreMap":44},[48,2976,2977,2980,2982],{"class":50,"line":51},[48,2978,2979],{"class":58},"subr_list ",[48,2981,121],{"class":54},[48,2983,1002],{"class":58},[48,2985,2986,2988,2990,2992],{"class":50,"line":68},[48,2987,308],{"class":54},[48,2989,2783],{"class":58},[48,2991,314],{"class":54},[48,2993,2994],{"class":58}," centrality:\n",[48,2996,2997],{"class":50,"line":76},[48,2998,2999],{"class":58},"    subr_list.append((node, centrality[node]))\n",[48,3001,3002],{"class":50,"line":84},[48,3003,172],{"emptyLinePlaceholder":171},[48,3005,3006,3009,3011,3014,3016,3018,3021,3024,3027,3029],{"class":50,"line":98},[48,3007,3008],{"class":58},"sorted_subr_list ",[48,3010,121],{"class":54},[48,3012,3013],{"class":58}," 
subr_list.sort(",[48,3015,2903],{"class":203},[48,3017,121],{"class":54},[48,3019,3020],{"class":235},"lambda",[48,3022,3023],{"class":2702}," x",[48,3025,3026],{"class":58},": x[",[48,3028,2252],{"class":144},[48,3030,1938],{"class":58},[39,3032,3034],{"className":41,"code":3033,"language":43,"meta":44,"style":44},"for x in sorted(subr_list, key=lambda x: x[1], reverse=True)[:200]: print x[0],\n",[30,3035,3036],{"__ignoreMap":44},[48,3037,3038,3040,3042,3044,3047,3050,3052,3054,3056,3058,3060,3062,3065,3067,3069,3071,3074,3077,3080,3082,3085,3087],{"class":50,"line":51},[48,3039,308],{"class":54},[48,3041,311],{"class":58},[48,3043,314],{"class":54},[48,3045,3046],{"class":320}," sorted",[48,3048,3049],{"class":58},"(subr_list, ",[48,3051,2903],{"class":203},[48,3053,121],{"class":54},[48,3055,3020],{"class":235},[48,3057,3023],{"class":2702},[48,3059,3026],{"class":58},[48,3061,2252],{"class":144},[48,3063,3064],{"class":58},"], ",[48,3066,2954],{"class":203},[48,3068,121],{"class":54},[48,3070,2959],{"class":144},[48,3072,3073],{"class":58},")[:",[48,3075,3076],{"class":144},"200",[48,3078,3079],{"class":58},"]: ",[48,3081,321],{"class":320},[48,3083,3084],{"class":58}," x[",[48,3086,1969],{"class":144},[48,3088,3089],{"class":58},"],\n",[39,3091,3094],{"className":3092,"code":3093,"language":628},[626],"/r/imaginarybattlefields /r/imaginarycityscapes /r/imaginarywastelands /r/imaginarywildlands /r/imaginaryleviathans /r/imaginarydragons /r/imaginarystarscapes /r/imaginarywesteros /r/imaginaryartifacts /r/imaginaryangels /r/imaginarymaps /r/imaginarybehemoths /r/imaginarydemons /r/imaginaryelves /r/imaginarycentaurs /r/imaginaryfuturewar /r/imaginarysoldiers /r/imaginaryhistory /r/imaginaryarmor /r/imaginarystarships /r/imaginarynetwork /r/imaginaryjedi /r/imaginarydinosaurs /r/imaginarysteampunk /r/imaginarycyberpunk /r/imaginaryarchers /r/imaginaryvehicles /r/imaginaryanime /r/imaginaryfallout /r/imaginaryastronauts /r/imaginarymusic /r/imaginaryfactories 
/r/imaginaryequestria /r/imaginarywarships /r/imaginaryazeroth /r/imaginaryarrakis /r/imaginarydisney /r/imaginarypolitics /r/imaginaryhorrors /r/imaginarywinterscapes /r/imaginaryseascapes /r/imaginarypirates /r/imaginarywarriors /r/imaginarymiddleearth /r/imaginarygallifrey /r/imaginarymechs /r/imaginarypropaganda /r/imaginarymerfolk /r/imaginaryvikings /r/imaginaryundead /r/imaginarybeasts /r/imaginarymutants /r/imaginaryruins /r/imaginarytamriel /r/imaginaryforests /r/imaginaryelementals /r/imaginaryskyscapes /r/imaginarymonuments /r/imaginarywaterfalls /r/imaginaryworlds /r/imaginarywizards /r/imaginaryinteriors /r/imaginaryhogwarts /r/imaginarytowers /r/imaginaryarchitecture /r/imaginaryweaponry /r/imaginarygaming /r/imaginarycastles /r/imaginaryrobotics /r/imaginarybooks /r/imaginarygnomes /r/imaginaryvillages /r/imaginarydeserts /r/imaginarywerewolves /r/imaginarydieselpunk /r/imaginaryvampires /r/imaginaryadrenaline /r/imaginarykanto /r/imaginarynatives /r/imaginaryrivers /r/imaginarytemples /r/imaginaryassassins /r/imaginaryvolcanoes /r/imaginaryclerics /r/imaginaryprisons /r/imaginarygiants /r/imaginarycowboys /r/imaginaryhumans /r/imaginarydwarves /r/imaginarycaves /r/imaginarytrolls /r/imaginarywalls /r/imaginarylakes /r/imaginarywitches /r/imaginaryorcs /r/imaginarycanyons /r/imaginaryasylums /r/imaginaryimmortals /r/imaginaryaliens /r/imaginarynobles /r/imaginaryspirits /r/imaginaryaetherpunk /r/imaginarytrees /r/imaginaryislands /r/imaginaryninjas /r/imaginaryscience /r/imaginarymountains /r/imaginaryknights /r/imaginarygoblins /r/imaginaryfaeries /r/imaginarygotham /r/imaginarycybernetics /r/imaginaryooo /r/imaginaryderelicts /r/imaginaryfood /r/imaginaryworldeaters /r/imaginarymindscapes /r/imaginaryaww /r/imaginarymarvel /r/imaginaryweather /r/imaginarynewnewyork /r/imaginaryspidey /r/imaginaryautumnscapes /r/imaginarywarhammer /r/imaginaryfeels /r/imaginarywitcher /r/imaginaryvessels /r/imaginarytaverns /r/imaginarybestof /r/imaginaryairships 
/r/imaginaryportals /r/imaginaryfashion /r/imaginarylovers /r/imaginarydc /r/imaginaryanimals /r/imaginaryhellscapes /r/imaginarycolorscapes /r/imaginarymonstergirls /r/imaginaryswamps /r/imaginarymythology /r/imaginaryscholars /r/imaginaryladyboners /r/imaginaryfuturism /r/imaginaryaviation /r/imaginarypathways /r/imaginarygatherings /r/imaginarybodyscapes /r/imaginaryoverwatch /r/imaginarydwellings /r/imaginarystephenking /r/specart /r/inegentlemanboners /r/comicbookart /r/imaginarymasseffect /r/imaginaryhalo /r/imaginaryjerk /r/backgroundart /r/futureporn /r/imaginarywallpapers /r/imaginaryfamilies /r/imaginarylibraries /r/imaginaryturtleworlds /r/imaginarydesigns /r/wallpapers /r/apocalypseporn /r/comicbookporn /r/isometric /r/imaginarybakerst /r/imaginaryverse /r/imaginarysunnydale /r/imaginaryfederation /r/imaginarysanctuary /r/starshipporn /r/imaginarystarcraft /r/imaginaryoldkingdom /r/imaginarynarnia /r/imaginarycybertron /r/gameworlds /r/imaginarycarnage /r/imaginaryboners /r/icandrawthat /r/imaginarycosmere /r/imaginaryaperture /r/armoredwomen /r/imaginarywtf /r/unusualart /r/imaginaryblueprints /r/alternativeart /r/sympatheticmonsters /r/adorabledragons /r/imaginarysummerscapes /r/imaginarygayboners /r/imaginarystash /r/artistoftheday /r/imaginaryglaciers /r/imaginaryhybrids /r/imaginaryadventurers /r/imaginarymetropolis /r/craftsoficeandfire /r/popartnouveau\n",[30,3095,3093],{"__ignoreMap":44},[15,3097,3098],{},"There seems to be a network of \"imaginary\" subreddits that have the highest centrality. The members of this network probably all link to themselves as well as many other subreddits as the \"imaginary\" topics span a wide range of content. This network may be drowning out other nodes that would otherwise have a high centrality relative to the rest of the subreddits. It might be interesting to eliminate these nodes from the graph and recalculate centrality. 
Let's look at the distribution of centrality values:",[39,3100,3102],{"className":41,"code":3101,"language":43,"meta":44,"style":44},"_ = plt.plot(sorted(centrality.values(), reverse=True)[:1000])\n_ = plt.title('Subreddit Centrality (top 1000)')\n_ = plt.xlabel('Rank')\n_ = plt.ylabel('Centrality')\nplt.savefig(os.path.expanduser('~/Documents/GitHub/briancaffey.github.io/img/subreddit_graph/centrality.png'))\n",[30,3103,3104,3130,3143,3156,3169],{"__ignoreMap":44},[48,3105,3106,3108,3110,3113,3115,3117,3119,3121,3123,3125,3128],{"class":50,"line":51},[48,3107,1982],{"class":58},[48,3109,121],{"class":54},[48,3111,3112],{"class":58}," plt.plot(",[48,3114,2948],{"class":320},[48,3116,2951],{"class":58},[48,3118,2954],{"class":203},[48,3120,121],{"class":54},[48,3122,2959],{"class":144},[48,3124,3073],{"class":58},[48,3126,3127],{"class":144},"1000",[48,3129,1938],{"class":58},[48,3131,3132,3134,3136,3138,3141],{"class":50,"line":68},[48,3133,1982],{"class":58},[48,3135,121],{"class":54},[48,3137,1996],{"class":58},[48,3139,3140],{"class":132},"'Subreddit Centrality (top 1000)'",[48,3142,136],{"class":58},[48,3144,3145,3147,3149,3151,3154],{"class":50,"line":76},[48,3146,1982],{"class":58},[48,3148,121],{"class":54},[48,3150,2010],{"class":58},[48,3152,3153],{"class":132},"'Rank'",[48,3155,136],{"class":58},[48,3157,3158,3160,3162,3164,3167],{"class":50,"line":84},[48,3159,1982],{"class":58},[48,3161,121],{"class":54},[48,3163,2024],{"class":58},[48,3165,3166],{"class":132},"'Centrality'",[48,3168,136],{"class":58},[48,3170,3171,3173,3176],{"class":50,"line":98},[48,3172,2034],{"class":58},[48,3174,3175],{"class":132},"'~/Documents/GitHub/briancaffey.github.io/img/subreddit_graph/centrality.png'",[48,3177,669],{"class":58},[15,3179,3180],{},[2043,3181],{"alt":2045,"src":3182},"/static/subreddit_graph/centrality.png",[10,3184,3186],{"id":3185},"connectedness","Connectedness",[15,3188,3189],{},"Let's take a look at the graph as a whole. 
One thing I'm not sure of is whether or not the entire graph is connected. This means that any node can be reached from any other node. Since we constructed the graph from 49 unrelated nodes, it is possible that the graph is unconnected. This would mean that one or more of the default subreddits and its subreddits is not connected with the rest of the graph. In searching for the shortest path I did not come across any pairs of nodes that did not have a path between themselves. I wouldn't be surprised if there are a handful of nodes that stand on their own.",[39,3191,3193],{"className":41,"code":3192,"language":43,"meta":44,"style":44},"#size of graph: nodes and edges (or, subreddits and connecting links)\nprint \"Our graph has \" + str(nx.number_of_nodes(G1)) + ' nodes and ' + str(nx.number_of_edges(G1)) + ' edges.'\n",[30,3194,3195,3200],{"__ignoreMap":44},[48,3196,3197],{"class":50,"line":51},[48,3198,3199],{"class":657},"#size of graph: nodes and edges (or, subreddits and connecting links)\n",[48,3201,3202,3204,3207,3209,3211,3214,3216,3219,3221,3223,3226,3228],{"class":50,"line":68},[48,3203,321],{"class":320},[48,3205,3206],{"class":132}," \"Our graph has \"",[48,3208,148],{"class":54},[48,3210,905],{"class":281},[48,3212,3213],{"class":58},"(nx.number_of_nodes(G1)) ",[48,3215,261],{"class":54},[48,3217,3218],{"class":132}," ' nodes and '",[48,3220,148],{"class":54},[48,3222,905],{"class":281},[48,3224,3225],{"class":58},"(nx.number_of_edges(G1)) ",[48,3227,261],{"class":54},[48,3229,3230],{"class":132}," ' edges.'\n",[39,3232,3235],{"className":3233,"code":3234,"language":628},[626],"Our graph has 29854 nodes and 149491 edges.\n",[30,3236,3234],{"__ignoreMap":44},[39,3238,3240],{"className":41,"code":3239,"language":43,"meta":44,"style":44},"print \"True of False: our graph is connected... 
\" + str(nx.is_connected(G1)) + '!'\n",[30,3241,3242],{"__ignoreMap":44},[48,3243,3244,3246,3249,3251,3253,3256,3258],{"class":50,"line":51},[48,3245,321],{"class":320},[48,3247,3248],{"class":132}," \"True of False: our graph is connected... \"",[48,3250,148],{"class":54},[48,3252,905],{"class":281},[48,3254,3255],{"class":58},"(nx.is_connected(G1)) ",[48,3257,261],{"class":54},[48,3259,3260],{"class":132}," '!'\n",[39,3262,3265],{"className":3263,"code":3264,"language":628},[626],"True of False: our graph is connected... False!\n",[30,3266,3264],{"__ignoreMap":44},[39,3268,3270],{"className":41,"code":3269,"language":43,"meta":44,"style":44},"Gc = max(nx.connected_component_subgraphs(G1), key=len)\nprint \"The largest connected component subgraph has \" + str(nx.number_of_nodes(Gc)) + \" nodes. \"\n",[30,3271,3272,3292],{"__ignoreMap":44},[48,3273,3274,3277,3279,3281,3284,3286,3288,3290],{"class":50,"line":51},[48,3275,3276],{"class":58},"Gc ",[48,3278,121],{"class":54},[48,3280,2897],{"class":320},[48,3282,3283],{"class":58},"(nx.connected_component_subgraphs(G1), ",[48,3285,2903],{"class":203},[48,3287,121],{"class":54},[48,3289,1974],{"class":320},[48,3291,136],{"class":58},[48,3293,3294,3296,3299,3301,3303,3306,3308],{"class":50,"line":68},[48,3295,321],{"class":320},[48,3297,3298],{"class":132}," \"The largest connected component subgraph has \"",[48,3300,148],{"class":54},[48,3302,905],{"class":281},[48,3304,3305],{"class":58},"(nx.number_of_nodes(Gc)) ",[48,3307,261],{"class":54},[48,3309,3310],{"class":132}," \" nodes. \"\n",[39,3312,3315],{"className":3313,"code":3314,"language":628},[626],"The largest connected component subgraph has 29840 nodes.\n",[30,3316,3314],{"__ignoreMap":44},[15,3318,3319],{},"There are 14 nodes that are not connected to the main connected component. 
Let's list them.",[39,3321,3323],{"className":41,"code":3322,"language":43,"meta":44,"style":44},"for x in list(set(nx.to_dict_of_lists(G1, nodelist=None).keys()) - set(nx.to_dict_of_lists(Gc, nodelist=None).keys())): print x,\n",[30,3324,3325],{"__ignoreMap":44},[48,3326,3327,3329,3331,3333,3335,3337,3339,3342,3345,3347,3349,3352,3354,3357,3360,3362,3364,3366,3369,3371],{"class":50,"line":51},[48,3328,308],{"class":54},[48,3330,311],{"class":58},[48,3332,314],{"class":54},[48,3334,282],{"class":281},[48,3336,285],{"class":58},[48,3338,288],{"class":281},[48,3340,3341],{"class":58},"(nx.to_dict_of_lists(G1, ",[48,3343,3344],{"class":203},"nodelist",[48,3346,121],{"class":54},[48,3348,1313],{"class":144},[48,3350,3351],{"class":58},").keys()) ",[48,3353,1126],{"class":54},[48,3355,3356],{"class":281}," set",[48,3358,3359],{"class":58},"(nx.to_dict_of_lists(Gc, ",[48,3361,3344],{"class":203},[48,3363,121],{"class":54},[48,3365,1313],{"class":144},[48,3367,3368],{"class":58},").keys())): ",[48,3370,321],{"class":320},[48,3372,3373],{"class":58}," x,\n",[39,3375,3378],{"className":3376,"code":3377,"language":628},[626],"/r/spacediscussions /r/wtfit.gif /r/space. /r/subreddit_graph /r/vidalia /r/listentothis. /r/history. /r/all. /r/ghostdriver /r/personalfinance. /r/toombscounty /r/gaming /r/science /r/books.\n",[30,3379,3377],{"__ignoreMap":44},[15,3381,3382],{},"Some of the large communities on reddit include /r/books, /r/gaming and /r/science. These subreddits list related subreddits on separate wiki pages since there are many related subreddits for each one. They were most likely all captured in the subsequent levels of the graph, but they also did not link back to /r/science. 
Here's an example:",[39,3384,3386],{"className":41,"code":3385,"language":43,"meta":44,"style":44},"for x in master_df_u.loc[master_df_u.subreddit=='/r/physics'].related: print x\n",[30,3387,3388],{"__ignoreMap":44},[48,3389,3390,3392,3394,3396,3399,3401,3404,3407,3409],{"class":50,"line":51},[48,3391,308],{"class":54},[48,3393,311],{"class":58},[48,3395,314],{"class":54},[48,3397,3398],{"class":58}," master_df_u.loc[master_df_u.subreddit",[48,3400,2100],{"class":54},[48,3402,3403],{"class":132},"'/r/physics'",[48,3405,3406],{"class":58},"].related: ",[48,3408,321],{"class":320},[48,3410,3411],{"class":58}," x\n",[39,3413,3416],{"className":3414,"code":3415,"language":628},[626],"['/r/physicsjokes', '/r/gradadmissions', '/r/homeworkhelp', '/r/scienceimages', '/r/askacademia', '/r/physicsgifs', '/r/physicsstudents', '/r/gradschool', '/r/askphysics', '/r/physics']\n",[30,3417,3415],{"__ignoreMap":44},[15,3419,3420,3421,3426],{},"I've got some additional ideas to explore in another post on this topic, such as finding cliques and maximal cliques, and doing graph visualizations with D3.js. 
If you are interested in playing with the data, you can clone ",[19,3422,3425],{"href":3423,"rel":3424},"https://github.com/briancaffey/reddit-graph-analysis",[361],"my GitHub repo"," and load the pickled DataFrames like this:",[39,3428,3430],{"className":41,"code":3429,"language":43,"meta":44,"style":44},"import pandas as pd\ndf = pd.read_pickle('pickle/master_df.p')\n",[30,3431,3432,3442],{"__ignoreMap":44},[48,3433,3434,3436,3438,3440],{"class":50,"line":51},[48,3435,62],{"class":54},[48,3437,1768],{"class":58},[48,3439,92],{"class":54},[48,3441,1773],{"class":58},[48,3443,3444,3447,3449,3451,3453],{"class":50,"line":68},[48,3445,3446],{"class":58},"df ",[48,3448,121],{"class":54},[48,3450,1836],{"class":58},[48,3452,1839],{"class":132},[48,3454,136],{"class":58},[3456,3457,3458],"style",{},"html pre.shiki code .sC2Qs, html code.shiki .sC2Qs{--shiki-default:#D73A49;--shiki-dark:#F97583;--shiki-sepia:#F92672}html pre.shiki code .sMOD_, html code.shiki .sMOD_{--shiki-default:#24292E;--shiki-dark:#E1E4E8;--shiki-sepia:#F8F8F2}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html .sepia .shiki span {color: 
var(--shiki-sepia);background: var(--shiki-sepia-bg);font-style: var(--shiki-sepia-font-style);font-weight: var(--shiki-sepia-font-weight);text-decoration: var(--shiki-sepia-text-decoration);}html.sepia .shiki span {color: var(--shiki-sepia);background: var(--shiki-sepia-bg);font-style: var(--shiki-sepia-font-style);font-weight: var(--shiki-sepia-font-weight);text-decoration: var(--shiki-sepia-text-decoration);}html pre.shiki code .sstjo, html code.shiki .sstjo{--shiki-default:#032F62;--shiki-dark:#9ECBFF;--shiki-sepia:#E6DB74}html pre.shiki code .s7F3e, html code.shiki .s7F3e{--shiki-default:#005CC5;--shiki-dark:#79B8FF;--shiki-sepia:#AE81FF}html pre.shiki code .sTHNf, html code.shiki .sTHNf{--shiki-default:#E36209;--shiki-default-font-style:inherit;--shiki-dark:#FFAB70;--shiki-dark-font-style:inherit;--shiki-sepia:#FD971F;--shiki-sepia-font-style:italic}html pre.shiki code .sq6CD, html code.shiki .sq6CD{--shiki-default:#D73A49;--shiki-default-font-style:inherit;--shiki-dark:#F97583;--shiki-dark-font-style:inherit;--shiki-sepia:#66D9EF;--shiki-sepia-font-style:italic}html pre.shiki code .sHuvb, html code.shiki .sHuvb{--shiki-default:#22863A;--shiki-default-font-weight:bold;--shiki-dark:#85E89D;--shiki-dark-font-weight:bold;--shiki-sepia:#AE81FF;--shiki-sepia-font-weight:inherit}html pre.shiki code .sFxd3, html code.shiki .sFxd3{--shiki-default:#032F62;--shiki-dark:#DBEDFF;--shiki-sepia:#E6DB74}html pre.shiki code .sLkwE, html code.shiki .sLkwE{--shiki-default:#005CC5;--shiki-dark:#79B8FF;--shiki-sepia:#E6DB74}html pre.shiki code .s-m8C, html code.shiki .s-m8C{--shiki-default:#005CC5;--shiki-default-font-style:inherit;--shiki-dark:#79B8FF;--shiki-dark-font-style:inherit;--shiki-sepia:#66D9EF;--shiki-sepia-font-style:italic}html pre.shiki code .sTrkL, html code.shiki .sTrkL{--shiki-default:#005CC5;--shiki-dark:#79B8FF;--shiki-sepia:#66D9EF}html pre.shiki code .s8-w5, html code.shiki .s8-w5{--shiki-default:#6A737D;--shiki-dark:#6A737D;--shiki-sepia:#88846F}html 
pre.shiki code .srTi1, html code.shiki .srTi1{--shiki-default:#6F42C1;--shiki-dark:#B392F0;--shiki-sepia:#A6E22E}html pre.shiki code .so59x, html code.shiki .so59x{--shiki-default:#24292E;--shiki-default-font-style:inherit;--shiki-dark:#E1E4E8;--shiki-dark-font-style:inherit;--shiki-sepia:#FD971F;--shiki-sepia-font-style:italic}",{"title":44,"searchDepth":68,"depth":68,"links":3460},[],"2017-03-03","This notebook explores some basic concepts of graph theory. A few weeks ago I set up a script to scrape data from reddit.com with the goal of visualizing the network of related subreddits (forums on specific topics) and related data.",false,"md",null,"/static/subreddits.png",{"layout":3468},"post","/2017/03/03/graph_subreddit",{"title":5,"description":3462},"2017/03/03/graph_subreddit",[3473,43,3474,3475,3476],"reddit","scraping","data","graphs","pD0No-htWYTWM38CUMsXhZHHedMdq9iyf-1Y_vEF19Y",1774067637727]