ó
¢³[Vc           @   sã  d  d l  m Z d  d l m Z m Z m Z d  d l m Z m Z m Z m	 Z	 d  d l m
 Z
 d d l m Z d  d l m Z d  d l m Z m Z d  d	 l m Z d  d
 l m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z d  d l j  Z! d  d l" Z" d  d l# m$ Z$ d  d l% m& Z& d  d l' m( Z( d „  Z) e e* ƒ Z+ e+ j, j- d ƒ e+ j, j. d d d d ƒ e) e+ ƒ Z e ƒ  Z/ e/ j0 e+ ƒ e" j1 e+ j, d ƒ Z2 e d d ƒ Z3 e d d ƒ Z4 e j ƒ  d „  ƒ Z5 d „  Z6 d d „ Z7 d d  d! „ Z8 d" „  Z9 d# „  Z: d$ „  Z; d% „  Z< e+ j= d& ƒ e+ j= d' ƒ d( „  ƒ ƒ Z> e+ j= d) ƒ d* „  ƒ Z? e+ j= d+ ƒ d, „  ƒ Z@ e+ j= d- d. d/ d0 g ƒd1 „  ƒ ZA e+ j= d2 d. d/ d0 g ƒd3 „  ƒ ZB d S(4   iÿÿÿÿ(   t   division(   t   Flaskt   gt   session(   t   render_templatet   flasht   redirectt   url_for(   t   requesti   (   t   SiteForm(   t   Session(   t   Celeryt   task(   t   MySQL(   t
   SQLAlchemyN(   t
   namedtuple(   t
   itemgetter(   t   CrawlerProcess(   t   solo(   t   UserInputPipelinec            sd   t  ˆ j d ˆ j d ƒ} | j j ˆ j ƒ | j ‰  d ˆ  f ‡  ‡ f d †  ƒ  Y} | | _ | S(   sT   
    Init celery environment with Redis broker. Allows use of celery decorator.
    t   brokert   CELERY_BROKER_URLt   ContextTaskc              s    e  Z e Z ‡  ‡ f d  †  Z RS(   c            s*   ˆ j  ƒ   ˆ  j |  | | Ž SWd  QXd  S(   N(   t   app_contextt   __call__(   t   selft   argst   kwargs(   t   TaskBaset   app(    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyR   )   s    (   t   __name__t
   __module__t   Truet   abstractR   (    (   R   R   (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyR   '   s   (   R   t   import_namet   configt   conft   updatet   Task(   R   t   celeryR   (    (   R   R   sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   make_celery   s    		R#   R   s   redis://localhost:6379t   CELERY_RESULT_BACKENDt   CORTIPY_API_KEYt   Sites   siteurl textt   CorticalSites!   siteurl text fingerprint keywordsc         C   sZ   t  i d d 6i d d 6d 6d d 6i d d 6d	 6ƒ } | j t j d
 |  ƒ| j ƒ  d S(   sZ   
    Initializes Scrapy crawler for given siteurl. See spider/pipeline code for more.
    s2   Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)t
   USER_AGENTid   s1   app.vcspider.vcspider.pipelines.UserInputPipelinet   ITEM_PIPELINESi   t   DEPTH_LIMITt   s3t   DOWNLOAD_HANDLERSt   domainN(   R   t   Nonet   crawlR   t
   SoloSpidert   start(   t   siteurlt   process(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   scrapeE   s    c   	      C   sU  t  j ƒ  } | j ƒ  } d } | j | |  f ƒ | j ƒ  } | j ƒ  | d k rÙ t d ƒ d } | j | |  f ƒ | j ƒ  \ } } } } | j ƒ  t j	 | ƒ } t
 | t ƒ rÃ | d } n  t |  | | | g ƒ St d j |  ƒ ƒ t j |  ƒ } xF | d k rC| j | |  f ƒ | j ƒ  } | j ƒ  t j d ƒ qþ Wt |  | ƒ Sd S(   sM   
    Either scrapes text of input site, or returns already-scraped data.
    s7   SELECT text FROM crunchbase_startups WHERE siteurl = %ss1   Site already scraped, proceeding with analysis...st   SELECT siteurl, text, cortical_io, cortical_io_keywords 
                FROM crunchbase_startups WHERE siteurl = %st	   positionss   Trying to scrape site: {}i   N(   t   mysqlt   connectt   cursort   executet   fetchonet   commitR3   R   t   astt   literal_evalt
   isinstancet   dictt   CSitet   formatR9   t   delayt   timet   sleepR+   (	   R7   t   cont   curt   sqlt   textt   _t   fingerprintt   keywordst   scraper(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   get_siteT   s0    


 
t   euclideanDistancec         C   s   t  j |  j | j ƒ | S(   s+   
    Now just for QA, get API distance
    (   t   clientt   compareRO   (   t   site1t   site2t   metric(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt
   getSDRDist}   s    t	   TextInputi    c         C   så   t  j | |  d g d ƒ } t  j |  d ƒ } t j ƒ  } | j ƒ  } | d k r„ d } | j | t | ƒ d j | ƒ | f ƒ n= | d k rÁ d } | j | |  t | ƒ d j | ƒ f ƒ n  | j	 ƒ  t
 | |  | d | g ƒ S(   sF   
    Makes an SDR and associated keywords for input site or text.
    i    t    s~   UPDATE crunchbase_startups
                SET cortical_io = %s, cortical_io_keywords = %s
                WHERE  siteurl = %st   ,i   sj   INSERT INTO sitedescriptions (text, cortical_io, cortical_io_keywords)
                VALUES (%s, %s, %s)R:   (   RT   t   createClassificationt   extractKeywordsR;   R<   R=   R>   t   strt   joinR@   RE   (   RM   R7   t   isTextt   site_corticalmapt   site_keywordsRJ   RK   RL   (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   makeSDR„   s    ++
c          C   sE   t  j ƒ  }  |  j ƒ  } d } | j | ƒ |  j ƒ  | j ƒ  } | S(   s/   Get all VCs for comparison to selected startup.s¸   SELECT siteurl, cortical_io, cortical_io_keywords 
    FROM vctest4
    WHERE 
        NULLIF(text, '') IS NOT NULL 
        AND NULLIF(cortical_io, '') IS NOT NULL
    ORDER BY RAND()(   R;   R<   R=   R>   R@   t   fetchall(   RJ   RK   RL   t   vcList(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt
   loadVCList¡   s    
c         C   s:   t  |  ƒ t  | ƒ } t  t |  ƒ t | ƒ Aƒ } | | S(   s  The Euclidean distance is defined as a float between 0 and 1, 0 representing a 
    smaller distance and thus closer match between two SDRs. It is calculated as 
    the quotient of the length of the symmetric difference between two sets and the
    total length of the combined sets.(   t   lent   set(   t   l1t   l2t   totlent   sublen(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   eucDist·   s    c         C   sC   t  t |  ƒ t | ƒ @ƒ } t  |  ƒ t  | ƒ } | t j | ƒ S(   s  Here (and only here, per cortical.io), cosine similarity is defined as a float 
    between 0 and 1, 1 representing a high similarity. It is calculated as the 
    quotient of the length of the number of bits contained in both sets and the square 
    root of len(l1) * len(l2).(   Rh   Ri   t   npt   sqrt(   Rj   Rk   t   overlapt   totsize(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   cosSimÃ   s    c         C   s  t  ƒ  } t ƒ  t ƒ  } } x† | D]~ } t |  j t j | d ƒ ƒ } | j t | d | g ƒ ƒ t |  j t j | d ƒ ƒ } | j t | d | g ƒ ƒ q# Wt	 | d t
 d ƒ ƒ} t	 | d t
 d ƒ d t ƒ} t d j | d  ƒ ƒ t d j | d  ƒ ƒ t | | f ƒ S(   sà   Find top three matches for the given startup based on both Euclidean distance
    and Cosine similarity (calculated per cortical's API, not really mappable to 
    traditional definitions, ESPECIALLY cosine similarity.)
    i   i    t   keyt   reverses   Euclidean: {}i   s
   Cosine: {}(   Rg   t   listRn   RO   RA   RB   t   appendt   tupleRs   t   sortedR   R    R   RF   (   t   startupRf   t	   scoresEuct	   scoresCost   vct   scoreEuct   scoreCos(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   getMatchÎ   s    	!t   /s   /indexc        
   C   s"   t  d d d d d d d d d ƒS(	   Ns
   index.htmlt   titlet   Homet   h1t   activet   h2t   inactivet   h3(   R   (    (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   indexì   s
    	s   /aboutc        
   C   s"   t  d d d d d d d d d ƒS(	   Ns
   about.htmlR‚   t   AboutR„   R‡   R†   R…   Rˆ   (   R   (    (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   aboutõ   s
    	s   /contactc        
   C   s"   t  d d d d d d d d d ƒS(	   Ns   contact.htmlR‚   t   ContactR„   R‡   R†   Rˆ   R…   (   R   (    (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   contactý   s
    	s   /processt   methodst   GETt   POSTc       
   C   sö   t  j j d ƒ }  |  d k rN t  j j d ƒ } t t | f ƒ d d ƒ} nT |  d  k r¢ t  j j d ƒ } t | ƒ } t | t ƒ r¢ t | j	 | j
 ƒ } q¢ n  t d j | j
 | j ƒ ƒ t d ƒ t | ƒ } t d d	 d
 d d d d d d ƒS(   Nt   descrt   yt	   siteinputRa   i   s   Keywords for site {}: {}s   Getting best match...s   process.htmlR‚   t   ResultsR„   R‡   R†   Rˆ   (   R   R   t   getRd   Rx   R3   RR   RC   R+   RM   R7   R   RF   RP   R€   R   (   R‘   R“   t   sitedataR7   t   matches(    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyR8     s    
	s   /inputc          C   sb   t  t j ƒ }  t j d k r: |  j ƒ  r: t t d ƒ ƒ St d d d d d d d d	 d d
 |  ƒS(   NR   R8   s
   input.htmlR‚   s
   Enter siteR„   R‡   R†   Rˆ   t   form(   R	   R   R˜   t   methodt   validateR   R   R   (   R˜   (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   input   s    	(C   t
   __future__R    t   flaskR   R   R   R   R   R   R   R   t   formsR	   t   flask.ext.sessionR
   R'   R   R   t   flask.ext.mysqlR   t   flask.ext.sqlalchemyR   t   reRA   RH   t   numpyRo   t   collectionsR   t   operatorR   t   mysql.connectort	   connectort   msct   cortipyt   scrapy.crawlerR   t   vcspider.vcspider.spidersR   t   vcspider.vcspider.pipelinesR   R(   R   R   R#   t   from_objectR%   R;   t   init_appt   CorticalClientRT   R+   RE   R9   RR   RY   Rd   Rg   Rn   Rs   R€   t   routeR‰   R‹   R   R8   R›   (    (    (    sX   /Users/johnmontroy/Documents/Learning/NYC-DSA/Project 3/banker.ai/mcubed/app/__init__.pyt   <module>   sZ   "			)				$