Skip to content

Commit 3c616ff

Browse files
committed
Daniel Whitenack - Go For Data Science
1 parent 174c39f commit 3c616ff

File tree

7 files changed

+121956
-0
lines changed

7 files changed

+121956
-0
lines changed
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// All material is licensed under the Apache License Version 2.0, January 2004
2+
// http://www.apache.org/licenses/LICENSE-2.0
3+
4+
// This example demonstrates how to train a regression model in Go. The example
5+
// also prints out formatted results and saves two plots: (i) a plot of the raw input
6+
// data, and (ii) a plot of the trained function overlaid on the raw input data.
7+
// The input data is data about Go github repositories gathered via getrepos.go.
8+
package main
9+
10+
import (
11+
"bytes"
12+
"encoding/csv"
13+
"log"
14+
"sort"
15+
"time"
16+
17+
"github.com/gonum/plot"
18+
"github.com/gonum/plot/plotter"
19+
"github.com/gonum/plot/plotutil"
20+
"github.com/gonum/plot/vg"
21+
"github.com/pachyderm/pachyderm/src/client"
22+
"github.com/pkg/errors"
23+
)
24+
25+
func main() {
26+
27+
// Aggregate the counts of created repos per day over all days.
28+
counts, err := prepareCountData("repodata")
29+
if err != nil {
30+
log.Fatal(err)
31+
}
32+
33+
// Create and save the plot showing the time series of
34+
// observed daily created repo counts.
35+
xys := preparePlotData(counts)
36+
if err = makePlots(xys); err != nil {
37+
log.Fatal(err)
38+
}
39+
}
40+
41+
// prepareCountData transforms the dataset into a series of
42+
// daily count values for plotting.
43+
func prepareCountData(dataSet string) ([][]int, error) {
44+
45+
// Create a map to store the daily counts of created repos.
46+
countMap := make(map[int]int)
47+
48+
// Get the data set we stored in pachyderm.
49+
data, err := getDataSet(dataSet, "master", "godata")
50+
if err != nil {
51+
return [][]int{}, errors.Wrap(err, "Could not get data from pachyderm")
52+
}
53+
54+
// Extract the records from the data.
55+
reader := csv.NewReader(bytes.NewReader(data.Bytes()))
56+
reader.FieldsPerRecord = -1
57+
records, err := reader.ReadAll()
58+
if err != nil {
59+
return [][]int{}, errors.Wrap(err, "Could not read in data records.")
60+
}
61+
62+
// Create a map of daily created repos where the keys are the days and
63+
// the values are the counts of created repos on that day.
64+
startTime := time.Date(2013, time.January, 1, 0, 0, 0, 0, time.UTC)
65+
layout := "2006-01-02 15:04:05"
66+
for _, each := range records {
67+
t, err := time.Parse(layout, each[2][0:19])
68+
if err != nil {
69+
return [][]int{}, errors.Wrap(err, "Could not parse timestamps")
70+
}
71+
interval := int(t.Sub(startTime).Hours() / 24.0)
72+
countMap[interval]++
73+
}
74+
75+
// Sort the day values which is required for plotting.
76+
var keys []int
77+
for k := range countMap {
78+
keys = append(keys, k)
79+
}
80+
sort.Ints(keys)
81+
var sortedCounts [][]int
82+
for _, k := range keys {
83+
sortedCounts = append(sortedCounts, []int{k, countMap[k]})
84+
}
85+
86+
return sortedCounts, nil
87+
}
88+
89+
// getDataSet gets a previously stored dataset from pachyderm data versioning.
90+
func getDataSet(dataSet, branch, repoName string) (bytes.Buffer, error) {
91+
92+
// Open a connection to pachyderm running on localhost.
93+
c, err := client.NewFromAddress("localhost:30650")
94+
if err != nil {
95+
return bytes.Buffer{}, errors.Wrap(err, "Could not connect to Pachyderm")
96+
}
97+
98+
// Read the latest commit of filename to the given repoName.
99+
var buffer bytes.Buffer
100+
if err := c.GetFile(repoName, branch, dataSet, 0, 0, "", nil, &buffer); err != nil {
101+
return buffer, errors.Wrap(err, "Could not retrieve pachyderm file")
102+
}
103+
104+
return buffer, nil
105+
}
106+
107+
// preparePlotData prepares the raw input data for plotting.
108+
func preparePlotData(counts [][]int) plotter.XYs {
109+
pts := make(plotter.XYs, len(counts))
110+
var i int
111+
112+
for _, count := range counts {
113+
pts[i].X = float64(count[0])
114+
pts[i].Y = float64(count[1])
115+
i++
116+
}
117+
118+
return pts
119+
}
120+
121+
// makePlots creates and saves the first of our plots showing the raw input data.
122+
func makePlots(xys plotter.XYs) error {
123+
124+
// Create a new plot.
125+
p, err := plot.New()
126+
if err != nil {
127+
return errors.Wrap(err, "Could not create plot object")
128+
}
129+
130+
// Label the new plot.
131+
p.Title.Text = "Daily Counts of Go Repos Created"
132+
p.X.Label.Text = "Days from Jan. 1, 2013"
133+
p.Y.Label.Text = "Count"
134+
135+
// Add the prepared points to the plot.
136+
if err = plotutil.AddLinePoints(p, "Counts", xys); err != nil {
137+
return errors.Wrap(err, "Could not add lines to plot")
138+
}
139+
140+
// Save the plot to a PNG file.
141+
if err := p.Save(7*vg.Inch, 4*vg.Inch, "countseries.png"); err != nil {
142+
return errors.Wrap(err, "Could not output plot")
143+
}
144+
145+
return nil
146+
}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// All material is licensed under the Apache License Version 2.0, January 2004
2+
// http://www.apache.org/licenses/LICENSE-2.0
3+
4+
// This example demonstrates how to scrape data about Go github repositories
5+
// using the github.com/google/go-github/github package.
6+
package main
7+
8+
import (
9+
"fmt"
10+
"log"
11+
"math"
12+
"time"
13+
14+
"github.com/google/go-github/github"
15+
"github.com/pkg/errors"
16+
)
17+
18+
func main() {
19+
20+
// Capture the date from which we will start scraping github data.
21+
t1 := time.Date(2013, time.January, 1, 0, 0, 0, 0, time.UTC)
22+
23+
// Pull the github data from t1 to now.
24+
if err := queryFromStartTime(t1); err != nil {
25+
log.Fatal(err)
26+
}
27+
}
28+
29+
// queryFromStartTime queries github for all 2 day time ranges of repo create
30+
// dates from a start time until now.
31+
func queryFromStartTime(t1 time.Time) error {
32+
33+
// Create a new github client.
34+
client := github.NewClient(nil)
35+
36+
// Loop over 2 days periods from the start time until now such that
37+
// the number of github repos we return is less than the 10 pages X
38+
// 100 results limit of the github API.
39+
for t1.Unix() < time.Now().Unix() {
40+
41+
// Create the github query string.
42+
t2 := t1.Add(time.Hour * 24 * 2)
43+
tString := fmt.Sprintf("\"%d-%02d-%02d .. %d-%02d-%02d\"",
44+
t1.Year(), t1.Month(), t1.Day(),
45+
t2.Year(), t2.Month(), t2.Day())
46+
query := fmt.Sprintf("language:Go created:" + tString)
47+
48+
// Query github with our new query.
49+
err := clientQuery(client, query)
50+
if err != nil {
51+
errors.Wrap(err, "Could not search Github repos")
52+
}
53+
54+
// Increment the start time of the 2 day chunk.
55+
t1 = t1.Add(time.Hour * 24 * 2)
56+
}
57+
58+
return nil
59+
}
60+
61+
// clientQuery executes github queries and searches over all pages of a result
62+
// set parsing results.
63+
func clientQuery(gh *github.Client, query string) error {
64+
65+
// Set the github search page we are currently searching.
66+
page := 1
67+
68+
// Set the maximum page of github results for a given query.
69+
maxPage := math.MaxInt32
70+
71+
// Set the github search options for the query.
72+
opts := github.SearchOptions{
73+
Sort: "stars",
74+
Order: "desc",
75+
ListOptions: github.ListOptions{
76+
PerPage: 100,
77+
},
78+
}
79+
80+
// Loop over pages in the github results so we can gather all
81+
// of the repositories in the search results.
82+
for page <= maxPage {
83+
84+
// We can utilized the Search.Repositories method to Query github
85+
// for a particular page of results.
86+
opts.Page = page
87+
result, response, err := gh.Search.Repositories(query, &opts)
88+
if err != nil {
89+
return errors.Wrap(err, "Could not search Github result pages")
90+
}
91+
92+
// Wait for the results.
93+
Wait(response)
94+
maxPage = response.LastPage
95+
96+
// Then we loop over the repositories in the search results.
97+
for _, repo := range result.Repositories {
98+
99+
// Extract the data of interest.
100+
name := *repo.FullName
101+
updatedAt := repo.UpdatedAt.String()
102+
createdAt := repo.CreatedAt.String()
103+
forks := *repo.ForksCount
104+
issues := *repo.OpenIssuesCount
105+
stars := *repo.StargazersCount
106+
size := *repo.Size
107+
108+
// Print out the results for now. However, this can be
109+
// redirected to a CSV file if desired.
110+
fmt.Printf("%s,%s,%s,%d,%d,%d,%d\n",
111+
name, updatedAt, createdAt, forks, issues, stars, size)
112+
}
113+
114+
// Sleep for 10 seconds to stay within Github's rate limiting
115+
// constraints, then it increments the page and query again.
116+
time.Sleep(time.Second * 10)
117+
page++
118+
}
119+
120+
return nil
121+
}
122+
123+
// Wait waits to make sure we return the full github response.
124+
func Wait(response *github.Response) {
125+
if response != nil && response.Remaining <= 1 {
126+
gap := time.Duration(response.Reset.Local().Unix() - time.Now().Unix())
127+
sleep := gap * time.Second
128+
if sleep < 0 {
129+
sleep = -sleep
130+
}
131+
time.Sleep(sleep)
132+
}
133+
}

0 commit comments

Comments
 (0)